Texas Real Estate Market Analysis

This study initially involves a more general analysis of the Texas region, and then focuses on a more specific analysis for each city.

The dataset contains 8 variables:

city = qualitative variable on a nominal scale

year = quantitative variable on an interval scale

month = qualitative variable on an ordinal scale

sales= discrete quantitative variable

volume = continue quantitative variable

median_price = continue quantitative variable

listings = discrete quantitative variable

months_inventory = continue quantitative variable

options(repos = c(CRAN = "https://cloud.r-project.org"))
#install.packages('rsconnect')
#install.packages("ggplot2")
#install.packages("dplyr")
#install.packages("moments")

data = read.csv("realestate_texas.csv", sep=",")
dim(data)

## [1] 240   8

N=dim(data)[1]

# "city" variable

table(data["city"])

## city
##              Beaumont Bryan-College Station                 Tyler 
##                    60                    60                    60 
##         Wichita Falls 
##                    60

abs_freq=table(data$city)
rel_freq=table(data$city)/N
distr_freq_city=cbind(abs_freq, rel_freq)
distr_freq_city

##                       abs_freq rel_freq
## Beaumont                    60     0.25
## Bryan-College Station       60     0.25
## Tyler                       60     0.25
## Wichita Falls               60     0.25

# "year" variable

table(data$year)

## 
## 2010 2011 2012 2013 2014 
##   48   48   48   48   48

# "month" variable

table(data$month)

## 
##  1  2  3  4  5  6  7  8  9 10 11 12 
## 20 20 20 20 20 20 20 20 20 20 20 20

attach(data)

# "sales" variable

min(sales)

## [1] 79

max(sales)

## [1] 423

median(sales)

## [1] 175.5

mean(sales)

## [1] 192.2917

quantile(sales)

##    0%   25%   50%   75%  100% 
##  79.0 127.0 175.5 247.0 423.0

range(sales)

## [1]  79 423

IQR(sales)

## [1] 120

var(sales)

## [1] 6344.3

sd(sales)

## [1] 79.65111

cv=function(x){
  return(sd(x)/mean(x)*100)
}
cv(sales)

## [1] 41.42203

library(moments)
skewness(sales)

## [1] 0.718104

kurtosis(sales)-3

## [1] -0.3131764

# "volume" variable

min(volume)

## [1] 8.166

max(volume)

## [1] 83.547

median(volume)

## [1] 27.0625

mean(volume)

## [1] 31.00519

quantile(volume)

##      0%     25%     50%     75%    100% 
##  8.1660 17.6595 27.0625 40.8930 83.5470

range(volume)

## [1]  8.166 83.547

IQR(volume)

## [1] 23.2335

var(volume)

## [1] 277.2707

sd(volume)

## [1] 16.65145

cv(volume)

## [1] 53.70536

skewness(volume)

## [1] 0.884742

kurtosis(volume)-3

## [1] 0.176987

# "listings" variable

min(listings)

## [1] 743

max(listings)

## [1] 3296

median(listings)

## [1] 1618.5

mean(listings)

## [1] 1738.021

quantile(listings)

##     0%    25%    50%    75%   100% 
##  743.0 1026.5 1618.5 2056.0 3296.0

range(listings)

## [1]  743 3296

IQR(listings)

## [1] 1029.5

var(listings)

## [1] 566569

sd(listings)

## [1] 752.7078

cv(listings)

## [1] 43.30833

skewness(listings)

## [1] 0.6494982

kurtosis(listings)-3

## [1] -0.79179

I calculated absolute and relative frequency for the qualitative variable “city” and I found that there are 4 cities in question and they all appear with the same frequency. Furthermore, for each city, data were recorded for each month of the year for 5 years (from 2010 to 2014).

Then, I calculated the position, variability and shape indices for the following variables: sales, volume and listings. The variable with the highest variability is “volume”, as it has the highest coefficient of variation. The most asymmetric variable is “volume”, since its asymmetry index is the one that is furthest from zero.

# creating classes for the "sales" variable

dim(data)

## [1] 240   8

N=dim(data)[1]
min(data$sales)

## [1] 79

max(data$sales)

## [1] 423

data$sales_cl=cut(data$sales, seq(70, 430, 60))

ni=table(data$sales_cl)
fi=ni/N
Ni=cumsum(ni)
Fi=Ni/N

cbind(ni, fi, Ni, Fi)

##           ni         fi  Ni        Fi
## (70,130]  67 0.27916667  67 0.2791667
## (130,190] 72 0.30000000 139 0.5791667
## (190,250] 42 0.17500000 181 0.7541667
## (250,310] 36 0.15000000 217 0.9041667
## (310,370] 17 0.07083333 234 0.9750000
## (370,430]  6 0.02500000 240 1.0000000

distr_freq_sales_cl=as.data.frame(cbind(ni, fi, Ni, Fi))

library(ggplot2)

ggplot(data=data)+
  geom_bar(aes(x=sales_cl),
           stat="count",
           col="black",
           fill="blue")+
  labs(title="Sales classes distribution",
       x="sales classes",
       y="absolute frequencies")+
  scale_y_continuous(breaks=seq(0,80,10))

# Gini index calculation

gini.index=function(x){
  ni=table(x)
  fi=ni/length(x)
  fi2=fi^2
  J=length(table(x))
  
  gini=1-sum(fi2)
  gini.normalized=gini/((J-1)/J)
  return(gini.normalized)
}

gini.index(distr_freq_sales_cl)

## [1] 0.6254826

I selected the variable sales and divided it into 6 classes of step 60. I created a frequency distribution and I represented the absolute frequencies with a bar graph.

From the graph we can see that the most frequent class is 130-190, while the least frequent is 370-430. Therefore, in Texas, during the 5 years, a number of sales per month between 130 and 190 was recorded most frequently.

Then, I calculated the Gini heterogeneity index, obtaining a value equal to 0.625, demonstrating that there is a certain heterogeneity of the distribution of values in the six classes.

# PROBABILITY CALCULATION

distr_freq_city

##                       abs_freq rel_freq
## Beaumont                    60     0.25
## Bryan-College Station       60     0.25
## Tyler                       60     0.25
## Wichita Falls               60     0.25

# The city "Beaumont" has a 25% chance of appearing in a randomly chosen row.

abs_freq_month=table(data$month)
rel_freq_month=abs_freq_month/N
distr_freq_month=cbind(abs_freq_month, rel_freq_month)
distr_freq_month

##    abs_freq_month rel_freq_month
## 1              20     0.08333333
## 2              20     0.08333333
## 3              20     0.08333333
## 4              20     0.08333333
## 5              20     0.08333333
## 6              20     0.08333333
## 7              20     0.08333333
## 8              20     0.08333333
## 9              20     0.08333333
## 10             20     0.08333333
## 11             20     0.08333333
## 12             20     0.08333333

# The month "Luglio" has about an 8% chance of appearing in a randomly chosen row.

table(data$month, data$year)

##     
##      2010 2011 2012 2013 2014
##   1     4    4    4    4    4
##   2     4    4    4    4    4
##   3     4    4    4    4    4
##   4     4    4    4    4    4
##   5     4    4    4    4    4
##   6     4    4    4    4    4
##   7     4    4    4    4    4
##   8     4    4    4    4    4
##   9     4    4    4    4    4
##   10    4    4    4    4    4
##   11    4    4    4    4    4
##   12    4    4    4    4    4

table(data$month, data$year)/N

##     
##            2010       2011       2012       2013       2014
##   1  0.01666667 0.01666667 0.01666667 0.01666667 0.01666667
##   2  0.01666667 0.01666667 0.01666667 0.01666667 0.01666667
##   3  0.01666667 0.01666667 0.01666667 0.01666667 0.01666667
##   4  0.01666667 0.01666667 0.01666667 0.01666667 0.01666667
##   5  0.01666667 0.01666667 0.01666667 0.01666667 0.01666667
##   6  0.01666667 0.01666667 0.01666667 0.01666667 0.01666667
##   7  0.01666667 0.01666667 0.01666667 0.01666667 0.01666667
##   8  0.01666667 0.01666667 0.01666667 0.01666667 0.01666667
##   9  0.01666667 0.01666667 0.01666667 0.01666667 0.01666667
##   10 0.01666667 0.01666667 0.01666667 0.01666667 0.01666667
##   11 0.01666667 0.01666667 0.01666667 0.01666667 0.01666667
##   12 0.01666667 0.01666667 0.01666667 0.01666667 0.01666667

# "Dicembre 2012" has about a 1.7% chance of appearing in a randomly chosen row.

I performed the probability calculations according to the classical meaning, i.e. number of favorable cases / number of possible cases, which corresponds to the relative frequency.

attach(data)

## I seguenti oggetti sono mascherati da data (pos = 5):
## 
##     city, listings, median_price, month, months_inventory, sales,
##     volume, year

# creation of the "average_price" variable

average_price=c((volume/sales)*1000000)
data$average_price=average_price

# creation of the "efficiency_index" variable

listings_per_month=c(listings/months_inventory)
range(listings_per_month)

## [1] 106.8354 329.2754

index=function(x){
  ifelse(x<=180,
    "1",
    ifelse(x<250,
      "2",
      "3"))
}

index(listings_per_month)

##   [1] "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1"
##  [19] "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1"
##  [37] "1" "1" "1" "1" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2"
##  [55] "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "1" "1" "1" "1" "1"
##  [73] "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1"
##  [91] "1" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2"
## [109] "2" "2" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3" "2" "2" "2" "3" "3" "3"
## [127] "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2"
## [145] "2" "2" "2" "2" "2" "2" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3"
## [163] "3" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3"
## [181] "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1"
## [199] "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1"
## [217] "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1"
## [235] "1" "1" "1" "1" "1" "1"

data$efficiency_index=index(listings_per_month)

abs_freq=table(data$efficiency_index, data$city)
rel_freq=abs_freq/N
distr_freq_efficiency_index=cbind(abs_freq, rel_freq)
distr_freq_efficiency_index

##   Beaumont Bryan-College Station Tyler Wichita Falls   Beaumont
## 1       40                    24     0            60 0.16666667
## 2       20                    26    27             0 0.08333333
## 3        0                    10    33             0 0.00000000
##   Bryan-College Station  Tyler Wichita Falls
## 1            0.10000000 0.0000          0.25
## 2            0.10833333 0.1125          0.00
## 3            0.04166667 0.1375          0.00

I used an index ranging from 1 to 3 to indicate the efficiency of the cities’ marketing strategy.

The city with the best market strategy over the 5 years is Tyler, while Wichita Falls is the worst as there is no growth over the years. As regards Beaumont, an improvement has been noted since 2013. Bryan-College Station appears to be the city that has seen the greatest growth over the years (starting from an efficiency index of 1, it reached an index of 3).

library(ggplot2)

# boxplot for the median sales price distribution

ggplot(data=data)+
  geom_boxplot(aes(x=city,
                   y=median_price),
               fill="lightblue")+
  labs(x="City",
       y="Median Price")+
  ggtitle("Median price distribution")+
  theme(plot.title = element_text(hjust = 0.5, size=14, face="bold"))

The boxplot shows the distribution of median sales prices for each city.

Bryan-College Station is the city with the highest sales prices, while Wichita Falls is the city with the lowest prices. There are no major differences in the level of variability and symmetry of the data distributions between cities. We can note, however, that Beaumont, Bryan-College Station, and Wichita Falls have some outliers.

#monthly sales distribution

library(dplyr)

## 
## Caricamento pacchetto: 'dplyr'

## I seguenti oggetti sono mascherati da 'package:stats':
## 
##     filter, lag

## I seguenti oggetti sono mascherati da 'package:base':
## 
##     intersect, setdiff, setequal, union

sales_per_month=data %>%
  group_by(city, month) %>%
  summarise(total_sales=sum(sales))

## `summarise()` has grouped output by 'city'. You can override using the
## `.groups` argument.

library(ggplot2)
ggplot(data=sales_per_month)+
  geom_col(aes(x=month,
               y=total_sales,
               fill=city),
           position="stack",
           col="black")+
  labs(title="Monthly Sales Distribution",
       x="Months",
       y="Absolute frequencies")+
  scale_x_continuous(breaks=seq(1,12,1))+
  theme_classic()

The graph shows a monthly distribution of sales summed for the 5 years.

It is noted that the city that recorded the least sales overall is Wichita Falls, followed by Beaumont, Bryan-College Station and finally Tyler. The months in which the most sales were recorded overall are the summer months, from May to August, with a peak in June. Instead, the months with the least sales are January and February.

#normalized monthly sales distribution

N=sum(sales_per_month$total_sales)
rel_freq=(sales_per_month$total_sales)/N

sales_per_month$rel_freq=rel_freq

library(ggplot2)
ggplot(data=sales_per_month)+
  geom_col(aes(x=month,
               y=rel_freq,
               fill=city),
           position="fill",
           col="black")+
  labs(title="Relative Monthly Sales Distribution",
       x="Months",
       y="Relative frequencies")+
  scale_x_continuous(breaks=seq(1,12,1))+
  scale_y_continuous(breaks=seq(0,1,0.1))

The normalized bar chart confirms the trend of individual cities.

# annual sales distribution

library(dplyr)  

sales_per_year=data %>%
  group_by(city, year) %>%
  summarise(total_sales=sum(sales))

## `summarise()` has grouped output by 'city'. You can override using the
## `.groups` argument.

library(ggplot2)
ggplot(data=sales_per_year)+
  geom_col(aes(x=year,
               y=total_sales,
               fill=city),
           position="stack",
           col="black")+
  labs(title="Annual Sales Distribution",
       x="Years",
       y="Absolute frequencies")+
  theme_classic()

The graph shows the sales trend over the 5 years. Over the years, there is an improvement, especially in 2014.

The city that contributed the most to this growth is Tyler, followed by Bryan-College Station, Beaumont and finally Wichita Falls. Furthermore, the latter has a rather static trend.

# Average sales price per year

average_volume=data%>%
  group_by(city, year) %>%
  summarise(average=mean(volume),
            st_dev=sd(volume))

## `summarise()` has grouped output by 'city'. You can override using the
## `.groups` argument.

ggplot(data=average_volume)+
  geom_col(aes(x=year,
               y=average,
               fill=city),
           position="dodge",
           col="black")+
  labs(title="Average sales price per year",
       x="Years",
       y="Average Price (million dollars)")+
  theme_classic()

The graph shows the average sales price by year and city.

The city with the highest average price is Tyler, followed by Bryan-College Station, Beaumont, and Wichita Falls. It is also noted that over the years there has been an increase in the average price for all cities except Wichita Falls.

# boxplot for sales price distribution

ggplot(data=data)+
  geom_boxplot(aes(x=city,
                   y=volume),
               fill="lightblue")+
  labs(x="City",
       y="Sales Price")+
  ggtitle("Sales price distribution")+
  theme(plot.title = element_text(hjust = 0.5, size=14, face="bold"))

The graph confirms Tyler as the city with the highest number of high prices and shows how Bryan-College Station has the most variability in the data distribution.

library(dplyr)
library(ggplot2)

# line chart for annual sales distribution

data$date=as.Date(with(data, paste(year, month, "01", sep="-")), "%Y-%m-%d")
ggplot(data=data)+
  geom_line(aes(x=date, y=sales, color=city), lwd=1)+
  labs(x="Date",
       y="Sales",
       title="Annual sales distribution")+
  scale_x_date(date_breaks = "3 month", date_labels = "%Y-%m")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The graph confirms what was found with the previous graphs. That is, Tyler is the most consistent with high sales, while Wichita Falls is the most consistent with low sales. Bryan-College Station is the most variable. It is interesting to underline how the sales peaks coincide with the spring-summer periods.

CONCLUSIONS

From this statistical analysis regarding the Texas region we can highlight the following conclusions:

over the 5 years examined, the aspect that has undergone the most changes is the value of recorded sales;
overall, more sales were recorded in the summer periods compared to the winter ones;
Tyler is the city that has recorded the most sales over the years and with the highest prices;
Bryan-College Station was the most variable in terms of sales and pricing;
Wichita Falls was the least active, with the fewest sales over the years.

Therefore, we can conclude that Tyler has the best market strategy (also considering the number of active ads and the months_inventory). It is advisable to review the Wichita Falls strategy.

Texas Real Estate Market Analysis

Zoe Cataldi

2024-11-19