library(patchwork)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(moments)
df = read.csv("realestate_texas.csv")
The first step with a new dataset is to inspect it, so that we can understand the type of data and structure the analysis:
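Before any manipulation, a quick structural overview helps; str() and summary() are standard first steps (output omitted here):
str(df)     # column types and a few sample values for each column
summary(df) # minimum, quartiles, mean and maximum of each numeric column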
df$year = as.factor(df$year)
head(df, 5)
## city year month sales volume median_price listings months_inventory
## 1 Beaumont 2010 1 83 14.162 163800 1533 9.5
## 2 Beaumont 2010 2 108 17.690 138200 1586 10.0
## 3 Beaumont 2010 3 182 28.701 122400 1689 10.6
## 4 Beaumont 2010 4 200 26.819 123200 1708 10.6
## 5 Beaumont 2010 5 202 28.833 123100 1771 10.9
as.data.frame(table(df["year"]))
## year Freq
## 1 2010 48
## 2 2011 48
## 3 2012 48
## 4 2013 48
## 5 2014 48
as.data.frame(table(df["city"]), col.names = c("City Name", "Absolute Frequency"))
## city Freq
## 1 Beaumont 60
## 2 Bryan-College Station 60
## 3 Tyler 60
## 4 Wichita Falls 60
The data are evenly distributed, so every city and every year carries the same weight in the analysis.
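A quick cross-check of this balance (every city-year combination should contain exactly 12 monthly rows):
table(df$city, df$year)             # expected: a 4 x 5 table filled with 12
all(table(df$city, df$year) == 12)  # expected: TRUE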
Now let's check the position indexes for "Sales", "Volume", "Median Price", "Listings" and "Months in inventory":
sales_max = max(df["sales"])
sales_min = min(df["sales"])
sales_median = median(df$sales)
sales_mean = round(mean(df$sales), 2)
sales_q1 = quantile(df$sales)[2]
sales_q2 = quantile(df$sales)[3]
sales_q3 = quantile(df$sales)[4]
sales_vect = c(sales_max, sales_min, sales_mean, sales_median, sales_q1, sales_q2, sales_q3)
sales_matrix = matrix(sales_vect, nrow = 1, ncol = length(sales_vect))
colnames(sales_matrix) = c("Max", "Min", "Mean", "Median", "Q1", "Q2", "Q3")
rownames(sales_matrix) = c("Sales")
# Helper that computes the same positional indexes for any numeric vector
position_indexes = function(x) {
  q = quantile(x)
  round(c(max(x), min(x), mean(x), median(x), q[[2]], q[[3]], q[[4]]), 2)
}
volume_matrix = matrix(position_indexes(df$volume), nrow = 1)
colnames(volume_matrix) = c("Max", "Min", "Mean", "Median", "Q1", "Q2", "Q3")
rownames(volume_matrix) = c("Volume")
price_matrix = matrix(position_indexes(df$median_price), nrow = 1)
colnames(price_matrix) = c("Max", "Min", "Mean", "Median", "Q1", "Q2", "Q3")
rownames(price_matrix) = c("Median Price")
list_matrix = matrix(position_indexes(df$listings), nrow = 1)
colnames(list_matrix) = c("Max", "Min", "Mean", "Median", "Q1", "Q2", "Q3")
rownames(list_matrix) = c("Listings")
inv_matrix = matrix(position_indexes(df$months_inventory), nrow = 1)
colnames(inv_matrix) = c("Max", "Min", "Mean", "Median", "Q1", "Q2", "Q3")
rownames(inv_matrix) = c("Month in inventory")
positional_indexes_matrix = rbind(sales_matrix, volume_matrix, price_matrix, list_matrix, inv_matrix)
print(positional_indexes_matrix)
## Max Min Mean Median Q1 Q2
## Sales 423.00 79.00 192.29 175.50 127.00 175.50
## Volume 83.55 8.17 31.01 26.96 17.57 26.96
## Median Price 180000.00 73800.00 132665.42 134500.00 116700.00 134500.00
## Listings 3296.00 743.00 1738.02 1617.00 1022.00 1617.00
## Month in inventory 14.90 3.40 9.19 8.90 7.80 8.90
## Q3
## Sales 247.00
## Volume 40.88
## Median Price 150000.00
## Listings 1984.00
## Month in inventory 10.90
"Month in inventory" has the most symmetrical distribution: its mean and median are very close. "Sales", "Listings" and "Volume" instead have a mean larger than the median, so their distributions are very likely right-skewed, with outliers on the right-hand side. "Median Price" shows the opposite behavior: the median is larger than the mean, so the distribution is very likely left-skewed, with outliers on the smaller side of the values.
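As a quick cross-check of these mean-median comparisons, the sign of the skewness index should agree with them (a sketch using moments::skewness, loaded above):
sapply(df[c("sales", "volume", "median_price", "listings", "months_inventory")],
       function(x) round(skewness(x), 2)) # positive = right-skewed, negative = left-skewed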
Computing the positional indexes grouped by city lets us compare the cities:
summary_city_sales = df %>%
group_by(city) %>%
summarise(
Sales_mean = round(mean(sales), 2), Sales_median = median(sales), Sales_max = max(sales), Sales_min = min(sales), Total_Sales = sum(sales))
summary_city_sales
## # A tibble: 4 × 6
## city Sales_mean Sales_median Sales_max Sales_min Total_Sales
## <chr> <dbl> <dbl> <int> <int> <int>
## 1 Beaumont 177. 176. 273 83 10643
## 2 Bryan-College Station 206. 186. 403 89 12358
## 3 Tyler 270. 271 423 143 16185
## 4 Wichita Falls 116. 114. 167 79 6964
summary_city_price = df %>%
group_by(city) %>%
summarise(
Median_Price_mean = round(mean(median_price), 2), Median_Price_max = max(median_price), Median_Price_min = min(median_price))
summary_city_price
## # A tibble: 4 × 4
## city Median_Price_mean Median_Price_max Median_Price_min
## <chr> <dbl> <dbl> <dbl>
## 1 Beaumont 129988. 163800 106700
## 2 Bryan-College Station 157488. 180000 140700
## 3 Tyler 141442. 161600 120600
## 4 Wichita Falls 101743. 135300 73800
From the tables above, Wichita Falls has the lowest sales (lower by at least 30%) and Tyler the highest. Tyler also has a fairly symmetrical distribution, with a mean almost equal to the median, similar to Beaumont. Bryan-College Station instead has very different mean and median values, so it is very likely to have outliers on the right-hand side of the distribution that pull the average of sales upwards.
Despite having the maximum sales, Tyler does not have the highest median price: it only ranks third.
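The asymmetry claim for Bryan-College Station can be checked directly (a minimal sketch; a positive value indicates a longer right tail):
df %>%
  group_by(city) %>%
  summarise(Sales_skewness = round(skewness(sales), 2))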
The data can also be grouped by year or month to understand the historical distribution of the position indexes:
summary_year = df %>%
group_by(year) %>%
summarise(
Sales_mean = round(mean(sales), 2), Sales_median = median(sales), Sales_max = max(sales), Sales_min = min(sales), Total_Sales = sum(sales))
summary_year
## # A tibble: 5 × 6
## year Sales_mean Sales_median Sales_max Sales_min Total_Sales
## <fct> <dbl> <dbl> <int> <int> <int>
## 1 2010 169. 162 316 83 8096
## 2 2011 164. 144. 313 79 7878
## 3 2012 186. 171 322 90 8935
## 4 2013 212. 194. 402 79 10172
## 5 2014 231. 215 423 89 11069
From 2010 to 2014 sales increase year by year, with the only drop in 2011. The mean is consistently greater than the median, so the distribution always has outliers on the high side of the curve.
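The year-over-year change can be quantified from the summary above (a minimal sketch; lag here is dplyr::lag):
summary_year %>%
  mutate(YoY_pct = round(100 * (Total_Sales / lag(Total_Sales) - 1), 1))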
summary_month = df %>%
group_by(month) %>%
summarise(
Sales_mean = round(mean(sales), 2), Sales_median = median(sales), Sales_max = max(sales), Sales_min = min(sales), Total_Sales = sum(sales))
summary_month
## # A tibble: 12 × 6
## month Sales_mean Sales_median Sales_max Sales_min Total_Sales
## <int> <dbl> <dbl> <int> <int> <int>
## 1 1 127. 112. 238 79 2548
## 2 2 141. 124. 244 79 2817
## 3 3 189. 176. 298 102 3789
## 4 4 212. 199 323 111 4234
## 5 5 239. 246 388 102 4777
## 6 6 244. 258 423 111 4871
## 7 7 236. 209 403 104 4715
## 8 8 231. 228 357 123 4629
## 9 9 182. 166. 361 95 3647
## 10 10 180. 164. 369 97 3598
## 11 11 157. 157 300 93 3137
## 12 12 169. 156. 332 81 3388
Within the year, sales peak in June, with a slow start in the first months and a small bump in December. The only exception to this pattern is August, whose median and minimum values are higher than the trend would suggest.
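A one-line confirmation of the seasonal peak and trough (month numbers, from the summary above):
summary_month$month[which.max(summary_month$Sales_mean)] # expected: 6 (June)
summary_month$month[which.min(summary_month$Sales_mean)] # expected: 1 (January)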
Below is a box plot by city, showing how Sales are distributed:
g1 = ggplot(df)+
geom_boxplot(aes(x=sales, y=city, fill = city), show.legend = FALSE)+
labs(x = "Year", y = "", title = "Box plot of Sales by City")+
theme_classic()+
theme(axis.text.y = element_text(
angle = 60,
hjust = 1,
vjust = -1), plot.title = element_text(hjust = 0.5))
g2 = ggplot(df)+
geom_boxplot(aes(x=sales, y=year, fill = year))+
labs(x = "Year", y = "", title = "Box plot of Sales by City")+
theme_classic()+theme(plot.title = element_text(hjust = 0.5))
g1 | g2
From the box plots we can see that Bryan-College Station has the most variable sales and Wichita Falls the most stable. Year by year the range widens and the interquartile range grows larger and larger. This behavior may be related to the company becoming better known, with more sales and more houses to sell, so the pool of houses becomes more varied and the distribution closer to a normal one. The plots also show no outliers.
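The "no outliers" reading can be verified numerically with the 1.5 x IQR rule that geom_boxplot uses by default (a sketch):
df %>%
  group_by(city) %>%
  summarise(n_outliers = sum(sales < quantile(sales, 0.25) - 1.5 * IQR(sales) |
                               sales > quantile(sales, 0.75) + 1.5 * IQR(sales)))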
Below is a numerical overview of the box plots:
summary_var = df %>%
group_by(city) %>%
summarise(
Sales_mean = round(mean(sales), 2), Sales_std = round(sd(sales),2), Sales_var = round(var(sales),2), Sales_iqr = IQR(sales))
summary_var
## # A tibble: 4 × 5
## city Sales_mean Sales_std Sales_var Sales_iqr
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Beaumont 177. 41.5 1721. 52
## 2 Bryan-College Station 206. 85.0 7222. 148.
## 3 Tyler 270. 62.0 3840. 86.8
## 4 Wichita Falls 116. 22.2 491. 33
summary_var_year = df %>%
group_by(year) %>%
summarise(
Sales_mean = round(mean(sales), 2), Sales_std = round(sd(sales),2), Sales_var = round(var(sales),2), Sales_iqr = IQR(sales))
summary_var_year
## # A tibble: 5 × 5
## year Sales_mean Sales_std Sales_var Sales_iqr
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 2010 169. 60.5 3665. 81.2
## 2 2011 164. 63.9 4079. 86.8
## 3 2012 186. 70.9 5028. 114
## 4 2013 212. 84 7055. 124.
## 5 2014 231. 95.5 9123. 151.
g1 = ggplot(df)+
geom_boxplot(aes(x=median_price, y=city, fill = city), show.legend = F)+
labs(x = "Year", y = "", title = "Box plot of Median Price by Year")+
theme_classic()+
theme(axis.text.y = element_text(
angle = 60,
hjust = 1,
vjust = -1), plot.title = element_text(hjust = 0.5))
g2 = ggplot(df)+
geom_boxplot(aes(x=median_price, y=year, fill = year))+
labs(x = "Year", y = "", title = "Box plot of Median Price by Year")+
theme_classic()+theme(plot.title = element_text(hjust = 0.5))
g1 | g2
We can see that the median price is very stable across cities and years. The by-city plot also shows some outliers.
Below is a numerical overview of the box plots:
summary_var_Median_price = df %>%
group_by(city) %>%
summarise(
Median_price_mean = round(mean(median_price), 2), Median_price_std = round(sd(median_price),2), Median_price_var = round(var(median_price),2), Median_price_iqr = IQR(median_price))
summary_var_Median_price
## # A tibble: 4 × 5
## city Median_price_mean Median_price_std Median_price_var Median_price_iqr
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Beaumont 129988. 10105. 102110879. 11525
## 2 Bryan-Co… 157488. 8852. 78362065. 11175
## 3 Tyler 141442. 9337. 87170946. 13700
## 4 Wichita … 101743. 11320. 128143175. 16375
summary_var_Median_price = df %>%
group_by(year) %>%
summarise(
Median_price_mean = round(mean(median_price), 2), Median_price_std = round(sd(median_price),2), Median_price_var = round(var(median_price),2), Median_price_iqr = IQR(median_price))
summary_var_Median_price
## # A tibble: 5 × 5
## year Median_price_mean Median_price_std Median_price_var Median_price_iqr
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 2010 130192. 21822. 476189291. 28075
## 2 2011 127854. 21318. 454448493. 33975
## 3 2012 130077. 21432. 459309889. 30925
## 4 2013 135723. 21708. 471240527. 34350
## 5 2014 139481. 25625. 656661556. 35075
# Reference lines: mean, median, and mode (the most frequent value of sales)
vline = data.frame(
value = c(mean(df$sales), median(df$sales), as.integer(names(which.max(table(df$sales))))),
name = c("Mean", "Median", "Mode")
)
ggplot(df)+
geom_bar(aes(sales), width = 2.5, fill = "#03c03c")+
geom_vline(
data = vline, aes(xintercept = value, color = name), linewidth = 0.6, linetype = "dashed")+
scale_color_manual(values = c("Mean" = "red", "Median" = "blue", "Mode" = "black"))+
annotate("label", label = paste("Fischer value =", round(skewness(df$sales), 2)), x = Inf, y = Inf, fill = "white", vjust = 1, hjust = 1.04)+
annotate("label", label = paste("Kurtosis value =", round(kurtosis(df$sales), 2)), x = Inf, y = Inf, fill = "white", vjust = 2.1, hjust = 1)+
labs(title = "Sales Distribution", y = "Count", x = "Sales", color = "Reference values")+
theme_classic()+theme(plot.title = element_text(hjust = 0.5))
Sales has a positive Fisher index: as the plot above shows, the bulk of the distribution sits on the left with a longer right tail, and the mean is greater than the median and the mode.
vline = data.frame(
value = c(mean(df$months_inventory), median(df$months_inventory), as.integer(names(which.max(table(df$months_inventory))))),
name = c("Mean", "Median", "Mode")
)
ggplot(df)+
geom_bar(aes(months_inventory), fill = "#03c03c")+
geom_vline(
data = vline, aes(xintercept = value, color = name), linewidth = 0.8, linetype = "dashed")+
scale_color_manual(values = c("Mean" = "red", "Median" = "blue", "Mode" = "black"))+
annotate("label", label = paste("Fischer value =", round(skewness(df$months_inventory), 2)), x = Inf, y = Inf, fill = "white", vjust = 1, hjust = 1.04)+
annotate("label", label = paste("Kurtosis value =", round(kurtosis(df$months_inventory), 2)), x = Inf, y = Inf, fill = "white", vjust = 2.1, hjust = 1)+
labs(title = "Listings Distribution", y = "Count", x = "Listings", color = "Reference values")+
theme_classic()+theme(plot.title = element_text(hjust = 0.5))
"Months in inventory" has a slightly positive Fisher index: the distribution above is barely right-skewed, with a mean greater than the median and the mode by about 1, and its shape is close to a normal distribution. In fact, with wider bins (and therefore less detail) it looks very similar to a normal curve, as shown below.
ggplot(df)+
geom_bar(aes(months_inventory), width = 2, fill = "#03c03c")+
geom_vline(
data = vline, aes(xintercept = value, color = name), linewidth = 0.8, linetype = "dashed")+
scale_color_manual(values = c("Mean" = "red", "Median" = "blue", "Mode" = "black"))+
annotate("label", label = paste("Fischer value =", round(skewness(df$months_inventory), 2)), x = Inf, y = Inf, fill = "white", vjust = 1, hjust = 1.04)+
annotate("label", label = paste("Kurtosis value =", round(kurtosis(df$months_inventory), 2)), x = Inf, y = Inf, fill = "white", vjust = 2.1, hjust = 1)+
labs(title = "Listings Distribution", y = "Count", x = "Listings", color = "Reference values")+
theme_classic()+theme(plot.title = element_text(hjust = 0.5))
For the other variables a distribution plot is not helpful, because each value appears only once or twice in the entire dataset, so it is easier to assess symmetry from a matrix of Fisher index values.
The kurtosis index is close to 3 for both variables plotted above, so their peakedness is close to that of a normal distribution (moments::kurtosis returns raw kurtosis: 3 corresponds to the normal curve, and values above 3 indicate a leptokurtic shape with a sharper peak).
form_mat = matrix(c(round(skewness(df$volume), 2), round(kurtosis(df$volume), 2), round(skewness(df$median_price),2),round(kurtosis(df$median_price),2), round(skewness(df$listings), 2), round(kurtosis(df$listings), 2)),nrow = 2, ncol = 3)
colnames(form_mat) = c("Volume", "Median Price", "Listings")
rownames(form_mat) = c("Fisher Index", "Kurtosis Index")
form_df = as.data.frame(form_mat)
form_df
## Volume Median Price Listings
## Fisher Index 0.88 -0.36 0.65
## Kurtosis Index 3.18 2.38 2.21
Volume and Listings have a positive index, so their distributions are right-skewed, with the mean greater than the median and the mode. Median Price has a negative index, so its distribution is left-skewed, with the mean smaller than the median and the mode.
On the kurtosis side the three variables differ: Volume (3.18) is slightly leptokurtic, with a peak sharper than the normal distribution, while Median Price (2.38) and Listings (2.21) are below 3 and therefore platykurtic, flatter than the normal.
Let’s visualize our historical trend for Sales:
g1 = ggplot(df)+
geom_col(aes(x = year, y = sales, fill = year), show.legend = FALSE)+
labs(x = "Year", y = "Sales", title = "Sales by year")+
theme_classic()+theme(plot.title = element_text(hjust = 0.5))
name_month = c("January","February","March","April","May","June","July","August","September","October","November","December")
g2 = ggplot(df)+
geom_col(aes(x = factor(month, levels = 1:12, labels = name_month), y = sales,
fill = factor(month, levels = 1:12, labels = name_month)), show.legend = FALSE)+
labs(title = "Sales by month", x = "Month", y = "Sales")+
theme_classic()+
theme(axis.text.x = element_text(
angle = 45,
hjust = 1,
vjust = 1), plot.title = element_text(hjust = 0.5))
g1 | g2
The image above shows the same trends analyzed before. Sales increase from 2010 to 2014 with the only exception of 2011. Within the year, June is the best month and January the worst, with a dip going towards December. So, if we imagine the trend plotted continuously, it should resemble the absolute value of a sine wave with its zeros in January. Let's visualize the entire trend to see if this assertion is correct:
ggplot(df)+
geom_col(aes(x = year, y = sales, fill = factor(month, levels = 1:12, labels = name_month)), position = "dodge")+
labs(title = "Sales by month divided by year", x = "Year", y = "Sales", fill = "Month")+
theme_classic()+theme(plot.title = element_text(hjust = 0.5))
The assertion is almost true: the historical data do look like the absolute value of a sine wave, but every year the zero (January) is translated slightly upwards, with the usual exception of 2011, confirming the yearly trend seen before.
Listings classes: to analyze the efficiency of the listings we can divide them into 4 classes based on the quartiles:
df$list_class <- cut(
df$listings,
breaks = quantile(df$listings),
labels = c("Few listings", "Low interquartile", "High interquartile", "Many listings"),
include.lowest = TRUE)
summary_listing_sales = df %>%
group_by(list_class) %>%
summarise(
Mean_sales = round(mean(sales), 2), Median_sales = round(median(sales),2), Sum_sales = sum(sales))
g1 = ggplot(summary_listing_sales)+
geom_col(aes(x = list_class, y = Mean_sales, fill = list_class), show.legend = FALSE)+
labs(title = "Mean of Sales by class", x = NULL, y = NULL)+
theme_classic()+
theme(axis.text.x = element_text(
angle = 15,
hjust = 1,
vjust = 1), plot.title = element_text(hjust = 0.5))
g2 = ggplot(summary_listing_sales)+
geom_col(aes(x = list_class, y = Median_sales, fill = list_class), show.legend = FALSE)+
labs(title = "Median of Sales by class", x = NULL, y = NULL)+
theme_classic()+
theme(axis.text.x = element_text(
angle = 15,
hjust = 1,
vjust = 1), plot.title = element_text(hjust = 0.5))
g3 = ggplot(summary_listing_sales)+
geom_col(aes(x = list_class, y = Sum_sales, fill = list_class), show.legend = FALSE)+
labs(title = "Sum of Sales by List class", x = NULL, y = NULL)+
theme_classic()+
theme(plot.title = element_text(hjust = 0.5))
(g1 | g2)
g3
From the two images above, within the interquartile range the sales are almost the same in terms of mean, median and sum. Above Q3 the mean and the median almost double and total sales increase by about 5000. Below Q1 sales drop drastically, in the same proportion. This means that within the interquartile range the number of listings does not affect sales, while Q3 acts almost like a breaking point above which the number of listings increases sales by 50%.
This newly found behavior describes a relation between the number of listings and sales. We can therefore analyze whether it is closer to a linear or an exponential relation, where more listings correspond to more sales, or something different, where a plateau appears and thus a maximum efficiency of the listings.
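Before re-binning into classes, the shape of the relation can be inspected directly with a scatter plot and a non-parametric smoother (a sketch, not part of the class-based analysis):
ggplot(df, aes(x = listings, y = sales)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "loess", formula = y ~ x) + # local smoother, no linearity assumption
labs(title = "Sales vs Listings", x = "Listings", y = "Sales") +
theme_classic()
round(cor(df$listings, df$sales), 2) # linear correlation as a first indicator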
Below is the same analysis restricted to the values above Q3:
df_listings_Q3 = subset(df, list_class == "Many listings")
df_listings_Q3$new_list_class <- cut(
df_listings_Q3$listings,
breaks = quantile(df_listings_Q3$listings),
labels = c("MIN-Q1", "Q1-Q2", "Q2-Q3", "Q3-MAX"),
include.lowest = TRUE)
summary_listing_sales_NEW = df_listings_Q3 %>%
group_by(new_list_class) %>%
summarise(
Mean_sales = round(mean(sales), 2), Median_sales = round(median(sales),2), Sum_sales = sum(sales))
g1 = ggplot(summary_listing_sales_NEW)+
geom_col(aes(x = new_list_class, y = Mean_sales, fill = new_list_class), show.legend = FALSE)+
labs(title = "Mean of Sales by class", x = NULL, y = NULL)+
theme_classic()+
theme(axis.text.x = element_text(
angle = 15,
hjust = 1,
vjust = 1), plot.title = element_text(hjust = 0.5))
g2 = ggplot(summary_listing_sales_NEW)+
geom_col(aes(x = new_list_class, y = Median_sales, fill = new_list_class), show.legend = FALSE)+
labs(title = "Median of Sales by class", x = NULL, y = NULL)+
theme_classic()+
theme(axis.text.x = element_text(
angle = 15,
hjust = 1,
vjust = 1), plot.title = element_text(hjust = 0.5))
g3 = ggplot(summary_listing_sales_NEW)+
geom_col(aes(x = new_list_class, y = Sum_sales, fill = new_list_class), show.legend = FALSE)+
labs(title = "Sum of Sales by List class", x = NULL, y = NULL)+
theme_classic()+
theme(plot.title = element_text(hjust = 0.5))
g1 | g2
g3
The relation between the number of listings and sales is quite complex: within the top class of the full dataset, listing efficiency peaks between Q2 and Q3, and above that sales slightly decrease. The relation is therefore likely to resemble a peaked curve: a roughly exponential rise up to a peak, followed by an exponential (or similar) decay.
The efficiency of the listings is not defined by sales alone, but also by the time needed to sell them all.
summary_listing_months_inventory = df %>%
group_by(list_class) %>%
summarise(
Mean_months_inventory = round(mean(months_inventory), 2), Median_months_inventory = round(median(months_inventory),2))
g1 = ggplot(summary_listing_months_inventory)+
geom_col(aes(x = list_class, y = Mean_months_inventory, fill = list_class), show.legend = FALSE)+
labs(title = "Mean of months in inventory by class", x = NULL, y = NULL)+
theme_classic()+
theme(axis.text.x = element_text(
angle = 15,
hjust = 1,
vjust = 1), plot.title = element_text(hjust = 0.5))
g2 = ggplot(summary_listing_months_inventory)+
geom_col(aes(x = list_class, y = Median_months_inventory, fill = list_class), show.legend = FALSE)+
labs(title = "Median of months in inventory by class", x = NULL, y = NULL)+
theme_classic()+
theme(axis.text.x = element_text(
angle = 15,
hjust = 1,
vjust = 1), plot.title = element_text(hjust = 0.5))
(g1 | g2)
As expected, more listings mean more time to sell.
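A one-line check of this relation (a positive correlation supports the reading):
round(cor(df$listings, df$months_inventory), 2)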
Listings' efficiency: taking into account the time needed to sell, we can define a new metric: dividing the total sales by the total months of inventory gives the number of sales per month of inventory.
efficiency_summary = df %>%
group_by(list_class) %>%
summarise(
Sales_Efficiency = sum(sales)/sum(months_inventory), Volume_Efficiency = sum(volume)/sum(months_inventory)
)
g1 = ggplot(efficiency_summary)+
geom_col(aes(x = list_class, y = Sales_Efficiency, fill = list_class), show.legend = FALSE)+
geom_text(aes(x = list_class, y = Sales_Efficiency, label = round(Sales_Efficiency, 2)), vjust = -0.5)+
labs(title = "Number of Sales by month in inventory", x = NULL, y = NULL)+
theme_classic()+
theme(axis.text.x = element_text(
angle = 15,
hjust = 1,
vjust = 1), plot.title = element_text(hjust = 0.5))
g1
The new metric reveals a different picture. Efficiency is highest when the number of listings is below the median but still within the interquartile range, with a similar efficiency above Q3, as explored before. So efficiency does not grow monotonically with listings: more listings mean more time to sell, up to a breaking point where the listings reach enough potential buyers. It would be very interesting to analyze where these listings are posted, to understand whether the quantity matters more than the placement.
The same pattern can be seen with the volume:
g1 = ggplot(efficiency_summary)+
geom_col(aes(x = list_class, y = Volume_Efficiency, fill = list_class), show.legend = FALSE)+
geom_text(aes(x = list_class, y = Volume_Efficiency, label = round(Volume_Efficiency, 3)), vjust = -0.5)+
labs(title = "Number of Sales by month in inventory", x = NULL, y = NULL)+
theme_classic()+
theme(axis.text.x = element_text(
angle = 15,
hjust = 1,
vjust = 1), plot.title = element_text(hjust = 0.5))
g1
A final note on this behavior: above Q3 the efficiency is probably smaller than the previous peak because it takes more time to sell. So, with the current approach, the right investment is fewer houses with a higher market value.
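A sketch supporting this reading, assuming volume is the total value of the sales (so volume divided by sales approximates the average value per house sold in each class):
df %>%
  group_by(list_class) %>%
  summarise(Avg_value_per_sale = round(sum(volume) / sum(sales), 4))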
# Normalized Gini heterogeneity index for a categorical variable
gini.index = function(x) {
f_i_2 = (table(x)/length(x)) ^ 2   # squared relative frequencies
j = length(table(x))               # number of classes
gini = 1 - sum(f_i_2)
gini_norm = gini/((j - 1)/j)       # normalize so the maximum is 1
return(gini_norm)}
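A quick sanity check of the normalized index on toy vectors (an evenly split vector should score 1):
gini.index(c("a", "a", "b", "b")) # maximum heterogeneity: returns 1
gini.index(c("a", "a", "a", "b")) # less heterogeneous: returns 0.75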
df$vol_class <- cut(
df$volume,
breaks = c(0, 15, 30, 50, Inf),
labels = c("Critical income", "Low income", "Sustainable income", "High income"),
include.lowest = TRUE)
vol_class_table_simple = table(df$vol_class)
vol_class_table_simple
##
## Critical income Low income Sustainable income High income
## 41 93 70 36
vol_class_table = table(df$vol_class, df$year)
sum_by_year = colSums(vol_class_table)   # total observations per year
vol_class_table = rbind(vol_class_table, sum_by_year)
sum = rowSums(vol_class_table)
vol_class_table = cbind(vol_class_table, sum)
colnames(vol_class_table) = c("2010", "2011", "2012", "2013", "2014", "Sum by volume class")
vol_class_table
## 2010 2011 2012 2013 2014 Sum by volume class
## Critical income 7 11 10 6 7 41
## Low income 26 23 19 15 10 93
## Sustainable income 15 13 11 15 16 70
## High income 0 1 8 12 15 36
## sum_by_year 48 48 48 48 48 240
gini = gini.index(df$vol_class)
print(paste("Gini index for the volume in the class defined is", round(gini,2), "so the distribution is very heterogeneous. Once more a validation of the simmetric distribution of the data."))
## [1] "Gini index for the volume in the class defined is 0.95 so the distribution is very heterogeneous. Once more a validation of the simmetric distribution of the data."
The dataset is distributed evenly between city and year, so the probability of extracting a row for any given city and/or year is always the same. This also allows time analyses without worrying about differing sensitivity or number of observations between cities or years.
Thanks to the positional index analysis, "Months in inventory" turns out to be fairly symmetrical (mean and median are very similar), while "Sales", "Listings" and "Volume" are right-skewed, with the mean greater than the median. "Median Price" is the opposite, left-skewed, with a median greater than the mean.
Grouping the variables by city shows that Wichita Falls has the lowest sales and Tyler the highest. Tyler also has a symmetrical distribution, as does Beaumont, while Bryan-College Station is right-skewed, with outliers on the right-hand side.
Tyler's median price only ranks third; Bryan-College Station ranks first, with the highest maximum, mean and minimum values.
Over the 2010-2014 period, sales increase year by year, with the only drop in 2011. The mean is consistently greater than the median, so the distribution always has outliers on the high side of the curve. Within the year, sales peak in June, with a slow start in the first months and a small bump in December; the only exception is August, whose median and minimum values are higher than the trend would suggest.
Bryan-College Station has the most variable sales and Wichita Falls the most stable. The median price is very stable across cities and years.
Sales has a positive Fisher index, so, as expected, the distribution is right-skewed, with the mean greater than the median and the mode, as is Volume. Listings has a slightly positive Fisher index, very similar to a normal distribution with a slight right skew. Median Price has a negative index, so its distribution is left-skewed.
On the kurtosis side, Volume is slightly leptokurtic (kurtosis above 3, with a peak sharper than the normal distribution), while Median Price and Listings are platykurtic (kurtosis below 3).
Sales increase from 2010 to 2014 with the only exception of 2011. Within the year, June is the best month and January the worst, with a dip going towards December. The historical distribution of sales resembles the absolute value of a sine wave, but every year the zero (January) is translated slightly upwards, with the usual exception of 2011, confirming the yearly trend.
Using the number of listings to define classes, above Q3 the mean and median of sales almost double and total sales increase by about 5000; below Q1 sales drop drastically, in the same proportion. This means that within the interquartile range the number of listings does not affect sales, while Q3 acts almost like a breaking point above which the number of listings increases sales by 50%. Repeating the analysis on the values above Q3 only, to check whether the trend continues like an exponential function, shows that listing efficiency peaks between Q2 and Q3 of that subset and slightly decreases above it. The relation is therefore likely to resemble a peaked curve: a roughly exponential rise up to a peak, followed by an exponential (or similar) decay.
The efficiency of the listings is not defined by sales alone, but also by the time needed to sell them all. Taking this into account, we defined a new metric: total sales divided by total months of inventory, i.e. the number of sales per month of inventory.
The new metric reveals a different picture: efficiency is highest when the number of listings is below the median but within the interquartile range, with a similar efficiency above Q3, as explored before. So more listings mean more time to sell, up to a breaking point where the listings reach enough potential buyers. The same pattern holds for volume.
The Gini index for the volume classes is 0.95, so the distribution is very heterogeneous: one more confirmation of the even distribution of the data.
From this exploratory analysis, Texas Realty Insights has several possible directions to expand its sales volume.