# Load the Texas real-estate data set and inspect its structure.
data_file <- read.csv("realestate_texas.csv")
str(data_file)

# Build a proper Date column from the year and month columns, pinned to the
# first day of each month, so the observations can be plotted on a time axis.
data_file$date <- as.Date(
  paste(data_file$year, data_file$month, "01", sep = "-")
)
str(data_file$date)
library(moments)
# Quantitative variables summarised in this section.
quant_var <- c(
  "sales", "volume", "median_price", "listings", "months_inventory"
)

# Minimum, maximum, quartiles, median and mean.
summary(data_file[quant_var])

# Check for non-positive values: the geometric mean below takes log(x), which
# is only meaningful for strictly positive data. The comparison must happen
# inside any(); comparing the result of any() to 0 would always be FALSE.
vapply(data_file[quant_var], function(x) any(x <= 0), logical(1))

# Geometric mean.
vapply(data_file[quant_var], function(x) exp(mean(log(x))), numeric(1))

# Variance.
vapply(data_file[quant_var], var, numeric(1))

# Standard deviation.
vapply(data_file[quant_var], sd, numeric(1))

# Coefficient of variation (percent).
vapply(data_file[quant_var], function(x) (sd(x) / mean(x)) * 100, numeric(1))

# Range (max - min).
vapply(data_file[quant_var], function(x) max(x) - min(x), numeric(1))

# Interquartile range.
vapply(data_file[quant_var], IQR, numeric(1))

# Skewness (moments package).
vapply(data_file[quant_var], skewness, numeric(1))

# Kurtosis (moments package; centred on 3, i.e. not excess kurtosis).
vapply(data_file[quant_var], kurtosis, numeric(1))
# Variables treated as categorical for the frequency distributions.
cat_var <- c("city", "date")

# For each categorical variable build a frequency table with absolute (ni),
# relative (fi), cumulative (Ni) and cumulative relative (Fi) frequencies.
# The result is a list with one data frame per entry of cat_var, in order.
date_city_freq_dist <- lapply(cat_var, function(x) {
  n <- table(data_file[[x]]) # absolute frequency
  f <- prop.table(n)         # relative frequency
  data_file_out <- data.frame(
    variable_name = names(n),
    ni = as.vector(n),
    fi = as.vector(f)
  )
  data_file_out$Ni <- cumsum(data_file_out$ni) # cumulative frequency
  data_file_out$Fi <- cumsum(data_file_out$fi) # cumulative relative frequency
  data_file_out
})
# Bin the number of sales into five classes of width 100. With right = TRUE and
# include.lowest = TRUE the intervals are [1,100], (100,200], ..., (400,500];
# values outside 1..500 become NA.
sales_classes <- cut(
  data_file$sales,
  breaks = c(1, 100, 200, 300, 400, 500),
  labels = c("1-100", "101-200", "201-300", "301-400", "401-500"),
  right = TRUE,
  include.lowest = TRUE
)

# Absolute, relative, cumulative and cumulative relative frequencies per class.
s_ni <- table(sales_classes)
s_fi <- prop.table(s_ni)
s_Ni <- cumsum(s_ni)
s_Fi <- cumsum(s_fi)

sales_freq_dist <- data.frame(
  Class = names(s_ni),
  ni = as.numeric(s_ni),
  fi = round(as.numeric(s_fi), 4),
  Ni = as.numeric(s_Ni),
  Fi = round(as.numeric(s_Fi), 4)
)

# Bar chart of the relative frequencies; barplot() returns the bar midpoints,
# which are reused to place the value labels above each bar.
sales_barplot <- barplot(
  height = sales_freq_dist$fi,
  names.arg = sales_freq_dist$Class,
  col = "lightblue",
  main = "Distribuzione delle vendite per classe",
  xlab = "Classi",
  ylab = "Frequenza relativa",
  border = "black",
  ylim = c(0, 0.6)
)
text(
  x = sales_barplot,
  y = sales_freq_dist$fi,
  labels = round(sales_freq_dist$fi, 3),
  pos = 3
)
# Normalised Gini heterogeneity index of a categorical (or discrete) vector.
#
# x: a vector or factor of observations. Missing values are ignored. For a
#    factor, unused levels count as categories (they appear in table(x) with
#    frequency zero), as in the original implementation.
#
# Returns a single number in [0, 1]: 0 when all observations fall in one
# category, 1 when they are spread evenly over all J categories. With fewer
# than two categories the index is returned as 0, because the normalising
# constant (J - 1) / J would be zero.
Gini.index <- function(x) {
  # Drop NAs so the relative frequencies sum to 1 (table() ignores NAs, but
  # length() does not).
  x <- x[!is.na(x)]
  counts <- table(x)
  J <- length(counts)
  if (J <= 1) {
    return(0)
  }
  fi <- counts / length(x)
  gini <- 1 - sum(fi^2)
  gini / ((J - 1) / J)
}
# Heterogeneity of the raw sales counts and of the binned sales classes.
Gini.index(data_file$sales)
Gini.index(sales_classes)

# Empirical probabilities: the share of rows that satisfy each condition
# (the mean of a logical vector is the proportion of TRUE values).
p_Beaumont <- mean(data_file$city == "Beaumont")
p_July <- mean(data_file$month == 7)
p_Dec2012 <- mean(data_file$date == as.Date("2012-12-01"))
# Average price per sale. volume is multiplied by 10^6 before dividing by the
# number of sales (the plots in this script label volume as millions of USD).
data_file$average_price <- (data_file$volume * 10^6) / data_file$sales

# Listing-efficacy index: the unweighted mean of three component scores.
#   - sales_listings_ratio: sales divided by active listings
#   - price_score: median price relative to the highest median price observed
#   - time_score: reciprocal of months_inventory (higher means faster turnover)
sales_listings_ratio <- data_file$sales / data_file$listings
price_score <- data_file$median_price / max(data_file$median_price)
time_score <- 1 / data_file$months_inventory
data_file$efficacy <- (sales_listings_ratio + price_score + time_score) / 3

# Descriptive statistics of the efficacy index.
summary(data_file$efficacy)
var(data_file$efficacy)
sd(data_file$efficacy)
(sd(data_file$efficacy) / mean(data_file$efficacy)) * 100 # coeff. of variation
skewness(data_file$efficacy)
kurtosis(data_file$efficacy)
library(dplyr)
# Mean and standard deviation of each quantitative variable within the groups
# defined by `group_var`.
#
# data:      data frame containing `group_var` and every column in `cols`.
# group_var: unquoted name of the grouping column.
# cols:      character vector of numeric columns to summarise.
#
# Returns an ungrouped tibble with one row per group and, for every column in
# `cols`, a `<col>_mean` and a `<col>_sd` column (in that order).
summarise_quant_by <- function(data, group_var,
                               cols = c("sales", "volume", "median_price",
                                        "listings", "months_inventory")) {
  data %>%
    group_by({{ group_var }}) %>%
    summarise(
      across(all_of(cols), list(mean = mean, sd = sd)),
      .groups = "drop"
    )
}

# The same summary by city, by year and by month.
city_summary <- summarise_quant_by(data_file, city)
year_summary <- summarise_quant_by(data_file, year)
month_summary <- summarise_quant_by(data_file, month)
library(ggplot2)
library(tidyr)
# Long-format table (one row per city and variable) of the mean sales and mean
# listings, as needed for a dodged bar chart.
s_l_city_plot <- city_summary %>%
  select(city, sales_mean, listings_mean) %>%
  pivot_longer(
    cols = c(sales_mean, listings_mean),
    names_to = "variable",
    values_to = "value"
  )

# Mean sales vs mean listings per city.
# The unnamed legend labels are matched to the fill levels in the scale's
# default (alphabetical) order: "listings_mean" then "sales_mean".
ggplot(s_l_city_plot, aes(x = city, y = value, fill = variable)) +
  geom_col(position = position_dodge(width = 0.8)) +
  geom_text(
    aes(label = round(value, 1)),
    position = position_dodge(width = 0.8),
    vjust = -0.3,
    size = 3.5
  ) +
  labs(
    title = "Vendite/Annunci per città",
    x = "Città",
    y = "Media di Vendite/Annunci"
  ) +
  scale_fill_manual(
    values = c("sales_mean" = "lightblue", "listings_mean" = "darkred"),
    labels = c("Annunci", "Vendite")
  ) +
  theme_classic(base_size = 10) +
  theme(
    axis.text.x = element_text(angle = 0, vjust = 0.5),
    legend.title = element_blank(),
    plot.title = element_text(face = "bold")
  )

# Mean of the median price per city, with +/- 1 standard deviation error bars.
ggplot(
  city_summary,
  aes(x = city, y = median_price_mean, fill = median_price_mean)
) +
  geom_col() +
  geom_errorbar(
    aes(
      ymin = median_price_mean - median_price_sd,
      ymax = median_price_mean + median_price_sd
    ),
    width = 0.2,
    color = "black"
  ) +
  geom_text(aes(label = round(median_price_mean, 0)), vjust = -3, size = 3.5) +
  scale_fill_gradient(low = "lightgreen", high = "darkgreen") +
  labs(
    title = "Prezzo Mediano per città",
    x = "Città",
    y = "Prezzo Mediano Medio"
  ) +
  scale_y_continuous(
    breaks = seq(0, 160000, by = 40000),
    limits = c(0, 180000)
  ) +
  theme_classic(base_size = 10) +
  theme(
    axis.text.x = element_text(angle = 0, vjust = 0.5),
    legend.position = "none",
    plot.title = element_text(face = "bold")
  )

# Mean months of inventory per city, with +/- 1 standard deviation error bars.
ggplot(
  city_summary,
  aes(x = city, y = months_inventory_mean, fill = months_inventory_mean)
) +
  geom_col() +
  geom_errorbar(
    aes(
      ymin = months_inventory_mean - months_inventory_sd,
      ymax = months_inventory_mean + months_inventory_sd
    ),
    width = 0.2,
    color = "black"
  ) +
  geom_text(
    aes(label = round(months_inventory_mean, 1)),
    vjust = -0.5,
    size = 5
  ) +
  scale_fill_gradient(low = "lightpink", high = "darkred") +
  scale_y_continuous(breaks = seq(0, 14, by = 2), limits = c(0, 14)) +
  labs(
    title = "Tempistiche di Vendita per città",
    x = "Città",
    y = "Mesi impiegati per vendere totale immobili"
  ) +
  theme_classic(base_size = 10) +
  theme(
    axis.text.x = element_text(angle = 0, vjust = 0.5),
    legend.position = "none",
    plot.title = element_text(face = "bold")
  )
# NOTE(review): ggplot2 >= 3.4.0 deprecates `size` for line geoms in favour of
# `linewidth`; `size` is kept here so the script also runs on older versions.

# Yearly trend of mean sales.
ggplot(year_summary, aes(x = year, y = sales_mean)) +
  geom_line(color = "red", size = 0.8) +
  geom_point(color = "darkred", size = 3) +
  geom_text(aes(label = round(sales_mean, 0)), vjust = -2, size = 3.5) +
  labs(
    title = "Andamento temporale delle Vendite",
    x = "Anno",
    y = "Media delle Vendite"
  ) +
  scale_y_continuous(breaks = seq(100, 250, by = 50), limits = c(100, 260)) +
  theme_bw(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 0, vjust = 0.5)
  )

# Yearly trend of the mean median price.
ggplot(year_summary, aes(x = year, y = median_price_mean)) +
  geom_line(color = "lightblue", size = 0.8) +
  geom_point(color = "darkblue", size = 3) +
  geom_text(aes(label = round(median_price_mean, 0)), vjust = -2, size = 3.5) +
  labs(
    title = "Andamento temporale del Prezzo Mediano",
    x = "Anno",
    y = "Media del Prezzo Mediano"
  ) +
  scale_y_continuous(
    breaks = seq(100000, 160000, by = 20000),
    limits = c(100000, 160000)
  ) +
  theme_bw(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 0, vjust = 0.5)
  )

# Mean sales by calendar month (seasonality).
ggplot(month_summary, aes(x = month, y = sales_mean)) +
  geom_line(color = "violet", size = 1) +
  geom_point(color = "darkviolet", size = 3) +
  geom_text(aes(label = round(sales_mean, 1)), vjust = -2, size = 3) +
  labs(
    title = "Vendite Mensili",
    x = "Mese",
    y = "Media delle Vendite"
  ) +
  scale_x_continuous(breaks = seq(0, 12, by = 1), limits = c(1, 12)) +
  scale_y_continuous(breaks = seq(100, 250, by = 50), limits = c(100, 280)) +
  theme_bw(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 0, vjust = 0.5)
  )

# Mean median price and mean sales by month on a dual axis. Sales are
# multiplied by 1000 to share the price axis; the secondary axis divides by
# 1000 again so it reads in sales units.
ggplot(month_summary, aes(x = month)) +
  geom_line(
    aes(y = median_price_mean, color = "Prezzo Mediano Medio"),
    size = 1
  ) +
  geom_line(aes(y = sales_mean * 1000, color = "Media Vendite"), size = 1) +
  geom_text(
    aes(
      y = median_price_mean,
      label = round(median_price_mean, 0),
      color = "Prezzo Mediano Medio"
    ),
    vjust = -0.5,
    size = 3
  ) +
  geom_text(
    aes(
      y = sales_mean * 1000,
      label = round(sales_mean, 0),
      color = "Media Vendite"
    ),
    vjust = -0.5,
    size = 3
  ) +
  scale_color_manual(
    name = "Variabili:",
    values = c("Prezzo Mediano Medio" = "blue", "Media Vendite" = "red")
  ) +
  scale_y_continuous(
    name = "Media del Prezzo Mediano",
    breaks = seq(125000, 250000, 50000),
    limits = c(120000, 250000),
    sec.axis = sec_axis(~ . / 1000, name = "Media delle Vendite")
  ) +
  labs(title = "Prezzi vs Vendite", x = "Mese") +
  scale_x_continuous(breaks = seq(1, 12, 1), limits = c(1, 13)) +
  theme_bw(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 0, vjust = 0.5),
    legend.position = "bottom"
  )
# Factor that rescales mean volume so its maximum coincides with the maximum
# mean median price, letting both series share one y axis.
max_price <- max(month_summary$median_price_mean)
max_volume <- max(month_summary$volume_mean)
scale_factor <- max_price / max_volume

# Long-format table of the two monthly series. `value` keeps the original
# units (used for the text labels); `plot_value` is the y position on the
# shared axis, i.e. volume multiplied by scale_factor and price unchanged.
month_plot <- month_summary %>%
  select(month, median_price_mean, volume_mean) %>%
  pivot_longer(
    cols = c(median_price_mean, volume_mean),
    names_to = "variable",
    values_to = "value"
  ) %>%
  mutate(
    plot_value = ifelse(variable == "volume_mean", value * scale_factor, value)
  )

# Mean median price vs mean total volume by month, on a dual axis. The
# secondary axis undoes the rescaling so it reads in volume units.
ggplot(month_plot, aes(x = month, y = plot_value, color = variable)) +
  geom_line(size = 1) +
  geom_point(size = 3) +
  geom_text(
    aes(label = round(value, 1)),
    vjust = -1,
    size = 3,
    check_overlap = FALSE
  ) +
  scale_color_manual(
    name = "Variabili",
    values = c("median_price_mean" = "blue", "volume_mean" = "red"),
    labels = c(
      "Prezzo Mediano Medio",
      "Ricavato Totale Medio delle Vendite (Milioni USD)"
    )
  ) +
  scale_y_continuous(
    name = "Prezzo Mediano Medio (USD)",
    sec.axis = sec_axis(
      ~ . / scale_factor,
      name = "Ricavato Totale Medio delle Vendite (Milioni USD)"
    )
  ) +
  scale_x_continuous(breaks = 1:12) +
  labs(
    title = "Prezzo Mediano vs Ricavato Totale delle Vendite Mensili",
    x = "Mese"
  ) +
  theme_bw(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 0, vjust = 0.5),
    legend.position = "bottom"
  )
# Distribution of the median price by city.
ggplot(data_file, aes(x = city, y = median_price, fill = city)) +
  geom_boxplot() +
  labs(
    title = "Distribuzione del Prezzo Mediano per Città",
    x = "Città",
    y = "Prezzo Mediano (USD)"
  ) +
  theme_classic(base_size = 10) +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5),
    legend.position = "none",
    plot.title = element_text(face = "bold")
  )

# Distribution of the total sales volume by city.
ggplot(data_file, aes(x = city, y = volume, fill = city)) +
  geom_boxplot(outlier.colour = "black", outlier.size = 1.5) +
  labs(
    title = "Distribuzione del Ricavato Totale delle Vendite per Città",
    x = "Città",
    y = "Ricavato delle Vendite (Milioni USD)"
  ) +
  theme_classic(base_size = 10) +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 0, vjust = 0.5)
  )

# Distribution of the total sales volume by year; year is converted to a
# factor so each year gets its own box.
ggplot(data_file, aes(x = factor(year), y = volume)) +
  geom_boxplot(
    fill = "lightblue",
    outlier.colour = "black",
    outlier.size = 1.5
  ) +
  labs(
    title = "Distribuzione del Ricavato Totale delle Vendite per Anno",
    x = "Anno",
    y = "Ricavato delle Vendite (Milioni USD)"
  ) +
  theme_classic(base_size = 10) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 0, hjust = 0.5)
  )
# Sales by month and city, dodged, drawn straight from the raw rows.
# NOTE(review): if the data holds several years, each month/city pair has
# several rows and their bars are drawn on top of each other rather than
# summed; the aggregated charts below show the actual totals.
ggplot(data_file, aes(x = month, y = sales, fill = city)) +
  geom_col(position = position_dodge(width = 0.8)) +
  scale_x_continuous(breaks = 1:12) +
  labs(
    title = "Totale Vendite Mensili per Città",
    x = "Mese",
    y = "Numero di Vendite",
    fill = "Città"
  ) +
  theme_classic(base_size = 10) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 0, vjust = 0.5)
  )

# Total sales per (month, city) pair, summed over all rows of that pair.
sales_month_city <- data_file %>%
  group_by(month, city) %>%
  summarise(total_sales = sum(sales), .groups = "drop")

# Stacked bars: total monthly sales, split by city.
ggplot(
  sales_month_city,
  aes(x = factor(month), y = total_sales, fill = city)
) +
  geom_col() +
  scale_x_discrete(drop = FALSE) +
  labs(
    title = "Totale delle Vendite Mensili per Città",
    x = "Mese",
    y = "Numero Totale di Vendite",
    fill = "Città"
  ) +
  theme_classic(base_size = 10) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 0)
  )

# Normalised stacked bars: each city's share of the monthly sales.
ggplot(
  sales_month_city,
  aes(x = factor(month), y = total_sales, fill = city)
) +
  geom_col(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  scale_x_discrete(drop = FALSE) +
  labs(
    title = "Distribuzione Percentuale delle Vendite Mensili per Città",
    x = "Mese",
    y = "Quota Percentuale delle Vendite",
    fill = "Città"
  ) +
  theme_classic(base_size = 10) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 0)
  )
# Monthly sales over time, one line per city.
# The axis limits are built by combining two Date objects. Putting the string
# first in c() would coerce the Date to a numeric string that as.Date() cannot
# parse, leaving the upper limit NA.
ggplot(data_file, aes(x = date, y = sales, color = city)) +
  geom_line(size = 1) +
  labs(
    title = "Andamento Storico delle Vendite Immobiliari in Texas",
    x = "Data",
    y = "Numero di Vendite",
    color = "Città"
  ) +
  scale_x_date(
    date_labels = "%Y-%m",
    date_breaks = "3 months",
    limits = c(as.Date("2010-01-01"), max(data_file$date))
  ) +
  theme_bw(base_size = 10) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    axis.text.x = element_text(angle = 60, hjust = 1),
    legend.position = "bottom"
  )