This study initially involves a more general analysis of the Texas region, and then focuses on a more specific analysis for each city.

The dataset contains 8 variables:

city = qualitative variable on a nominal scale

year = quantitative variable on an interval scale

month = qualitative variable on an ordinal scale

sales= discrete quantitative variable

volume = continue quantitative variable

median_price = continue quantitative variable

listings = discrete quantitative variable

months_inventory = continue quantitative variable

options(repos = c(CRAN = "https://cloud.r-project.org"))
#install.packages('rsconnect')
#install.packages("ggplot2")
#install.packages("dplyr")
#install.packages("moments")

data = read.csv("realestate_texas.csv", sep=",")
dim(data)
## [1] 240   8
N=dim(data)[1]

# "city" variable

table(data["city"])
## city
##              Beaumont Bryan-College Station                 Tyler 
##                    60                    60                    60 
##         Wichita Falls 
##                    60
abs_freq=table(data$city)
rel_freq=table(data$city)/N
distr_freq_city=cbind(abs_freq, rel_freq)
distr_freq_city
##                       abs_freq rel_freq
## Beaumont                    60     0.25
## Bryan-College Station       60     0.25
## Tyler                       60     0.25
## Wichita Falls               60     0.25
# "year" variable

table(data$year)
## 
## 2010 2011 2012 2013 2014 
##   48   48   48   48   48
# "month" variable

table(data$month)
## 
##  1  2  3  4  5  6  7  8  9 10 11 12 
## 20 20 20 20 20 20 20 20 20 20 20 20
attach(data)

# "sales" variable

min(sales)
## [1] 79
max(sales)
## [1] 423
median(sales)
## [1] 175.5
mean(sales)
## [1] 192.2917
quantile(sales)
##    0%   25%   50%   75%  100% 
##  79.0 127.0 175.5 247.0 423.0
range(sales)
## [1]  79 423
IQR(sales)
## [1] 120
var(sales)
## [1] 6344.3
sd(sales)
## [1] 79.65111
cv=function(x){
  return(sd(x)/mean(x)*100)
}
cv(sales)
## [1] 41.42203
library(moments)
skewness(sales)
## [1] 0.718104
kurtosis(sales)-3
## [1] -0.3131764
# "volume" variable

min(volume)
## [1] 8.166
max(volume)
## [1] 83.547
median(volume)
## [1] 27.0625
mean(volume)
## [1] 31.00519
quantile(volume)
##      0%     25%     50%     75%    100% 
##  8.1660 17.6595 27.0625 40.8930 83.5470
range(volume)
## [1]  8.166 83.547
IQR(volume)
## [1] 23.2335
var(volume)
## [1] 277.2707
sd(volume)
## [1] 16.65145
cv(volume)
## [1] 53.70536
skewness(volume)
## [1] 0.884742
kurtosis(volume)-3
## [1] 0.176987
# "listings" variable

min(listings)
## [1] 743
max(listings)
## [1] 3296
median(listings)
## [1] 1618.5
mean(listings)
## [1] 1738.021
quantile(listings)
##     0%    25%    50%    75%   100% 
##  743.0 1026.5 1618.5 2056.0 3296.0
range(listings)
## [1]  743 3296
IQR(listings)
## [1] 1029.5
var(listings)
## [1] 566569
sd(listings)
## [1] 752.7078
cv(listings)
## [1] 43.30833
skewness(listings)
## [1] 0.6494982
kurtosis(listings)-3
## [1] -0.79179

I calculated absolute and relative frequency for the qualitative variable “city” and I found that there are 4 cities in question and they all appear with the same frequency. Furthermore, for each city, data were recorded for each month of the year for 5 years (from 2010 to 2014).

Then, I calculated the position, variability and shape indices for the following variables: sales, volume and listings. The variable with the highest variability is “volume”, as it has the highest coefficient of variation. The most asymmetric variable is “volume”, since its asymmetry index is the one that is furthest from zero.

# creating classes for the "sales" variable

dim(data)
## [1] 240   8
N=dim(data)[1]
min(data$sales)
## [1] 79
max(data$sales)
## [1] 423
data$sales_cl=cut(data$sales, seq(70, 430, 60))

ni=table(data$sales_cl)
fi=ni/N
Ni=cumsum(ni)
Fi=Ni/N

cbind(ni, fi, Ni, Fi)
##           ni         fi  Ni        Fi
## (70,130]  67 0.27916667  67 0.2791667
## (130,190] 72 0.30000000 139 0.5791667
## (190,250] 42 0.17500000 181 0.7541667
## (250,310] 36 0.15000000 217 0.9041667
## (310,370] 17 0.07083333 234 0.9750000
## (370,430]  6 0.02500000 240 1.0000000
distr_freq_sales_cl=as.data.frame(cbind(ni, fi, Ni, Fi))

library(ggplot2)

ggplot(data=data)+
  geom_bar(aes(x=sales_cl),
           stat="count",
           col="black",
           fill="blue")+
  labs(title="Sales classes distribution",
       x="sales classes",
       y="absolute frequencies")+
  scale_y_continuous(breaks=seq(0,80,10))

# Gini index calculation

gini.index=function(x){
  ni=table(x)
  fi=ni/length(x)
  fi2=fi^2
  J=length(table(x))
  
  gini=1-sum(fi2)
  gini.normalized=gini/((J-1)/J)
  return(gini.normalized)
}

gini.index(distr_freq_sales_cl)
## [1] 0.6254826

I selected the variable sales and divided it into 6 classes of step 60. I created a frequency distribution and I represented the absolute frequencies with a bar graph.

From the graph we can see that the most frequent class is 130-190, while the least frequent is 370-430. Therefore, in Texas, during the 5 years, a number of sales per month between 130 and 190 was recorded most frequently.

Then, I calculated the Gini heterogeneity index, obtaining a value equal to 0.625, demonstrating that there is a certain heterogeneity of the distribution of values in the six classes.

# PROBABILITY CALCULATION

distr_freq_city
##                       abs_freq rel_freq
## Beaumont                    60     0.25
## Bryan-College Station       60     0.25
## Tyler                       60     0.25
## Wichita Falls               60     0.25
# The city "Beaumont" has a 25% chance of appearing in a randomly chosen row.

abs_freq_month=table(data$month)
rel_freq_month=abs_freq_month/N
distr_freq_month=cbind(abs_freq_month, rel_freq_month)
distr_freq_month
##    abs_freq_month rel_freq_month
## 1              20     0.08333333
## 2              20     0.08333333
## 3              20     0.08333333
## 4              20     0.08333333
## 5              20     0.08333333
## 6              20     0.08333333
## 7              20     0.08333333
## 8              20     0.08333333
## 9              20     0.08333333
## 10             20     0.08333333
## 11             20     0.08333333
## 12             20     0.08333333
# The month "Luglio" has about an 8% chance of appearing in a randomly chosen row.

table(data$month, data$year)
##     
##      2010 2011 2012 2013 2014
##   1     4    4    4    4    4
##   2     4    4    4    4    4
##   3     4    4    4    4    4
##   4     4    4    4    4    4
##   5     4    4    4    4    4
##   6     4    4    4    4    4
##   7     4    4    4    4    4
##   8     4    4    4    4    4
##   9     4    4    4    4    4
##   10    4    4    4    4    4
##   11    4    4    4    4    4
##   12    4    4    4    4    4
table(data$month, data$year)/N
##     
##            2010       2011       2012       2013       2014
##   1  0.01666667 0.01666667 0.01666667 0.01666667 0.01666667
##   2  0.01666667 0.01666667 0.01666667 0.01666667 0.01666667
##   3  0.01666667 0.01666667 0.01666667 0.01666667 0.01666667
##   4  0.01666667 0.01666667 0.01666667 0.01666667 0.01666667
##   5  0.01666667 0.01666667 0.01666667 0.01666667 0.01666667
##   6  0.01666667 0.01666667 0.01666667 0.01666667 0.01666667
##   7  0.01666667 0.01666667 0.01666667 0.01666667 0.01666667
##   8  0.01666667 0.01666667 0.01666667 0.01666667 0.01666667
##   9  0.01666667 0.01666667 0.01666667 0.01666667 0.01666667
##   10 0.01666667 0.01666667 0.01666667 0.01666667 0.01666667
##   11 0.01666667 0.01666667 0.01666667 0.01666667 0.01666667
##   12 0.01666667 0.01666667 0.01666667 0.01666667 0.01666667
# "Dicembre 2012" has about a 1.7% chance of appearing in a randomly chosen row.

I performed the probability calculations according to the classical meaning, i.e. number of favorable cases / number of possible cases, which corresponds to the relative frequency.

attach(data)
## I seguenti oggetti sono mascherati da data (pos = 5):
## 
##     city, listings, median_price, month, months_inventory, sales,
##     volume, year
# creation of the "average_price" variable

average_price=c((volume/sales)*1000000)
data$average_price=average_price

# creation of the "efficiency_index" variable

listings_per_month=c(listings/months_inventory)
range(listings_per_month)
## [1] 106.8354 329.2754
index=function(x){
  ifelse(x<=180,
    "1",
    ifelse(x<250,
      "2",
      "3"))
}

index(listings_per_month)
##   [1] "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1"
##  [19] "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1"
##  [37] "1" "1" "1" "1" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2"
##  [55] "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "1" "1" "1" "1" "1"
##  [73] "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1"
##  [91] "1" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2"
## [109] "2" "2" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3" "2" "2" "2" "3" "3" "3"
## [127] "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2" "2"
## [145] "2" "2" "2" "2" "2" "2" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3"
## [163] "3" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3" "3"
## [181] "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1"
## [199] "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1"
## [217] "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1"
## [235] "1" "1" "1" "1" "1" "1"
data$efficiency_index=index(listings_per_month)

abs_freq=table(data$efficiency_index, data$city)
rel_freq=abs_freq/N
distr_freq_efficiency_index=cbind(abs_freq, rel_freq)
distr_freq_efficiency_index
##   Beaumont Bryan-College Station Tyler Wichita Falls   Beaumont
## 1       40                    24     0            60 0.16666667
## 2       20                    26    27             0 0.08333333
## 3        0                    10    33             0 0.00000000
##   Bryan-College Station  Tyler Wichita Falls
## 1            0.10000000 0.0000          0.25
## 2            0.10833333 0.1125          0.00
## 3            0.04166667 0.1375          0.00

I used an index ranging from 1 to 3 to indicate the efficiency of the cities’ marketing strategy.

The city with the best market strategy over the 5 years is Tyler, while Wichita Falls is the worst as there is no growth over the years. As regards Beaumont, an improvement has been noted since 2013. Bryan-College Station appears to be the city that has seen the greatest growth over the years (starting from an efficiency index of 1, it reached an index of 3).

library(ggplot2)

# boxplot for the median sales price distribution

ggplot(data=data)+
  geom_boxplot(aes(x=city,
                   y=median_price),
               fill="lightblue")+
  labs(x="City",
       y="Median Price")+
  ggtitle("Median price distribution")+
  theme(plot.title = element_text(hjust = 0.5, size=14, face="bold"))

The boxplot shows the distribution of median sales prices for each city.

Bryan-College Station is the city with the highest sales prices, while Wichita Falls is the city with the lowest prices. There are no major differences in the level of variability and symmetry of the data distributions between cities. We can note, however, that Beaumont, Bryan-College Station, and Wichita Falls have some outliers.

#monthly sales distribution

library(dplyr)  
## 
## Caricamento pacchetto: 'dplyr'
## I seguenti oggetti sono mascherati da 'package:stats':
## 
##     filter, lag
## I seguenti oggetti sono mascherati da 'package:base':
## 
##     intersect, setdiff, setequal, union
sales_per_month=data %>%
  group_by(city, month) %>%
  summarise(total_sales=sum(sales))
## `summarise()` has grouped output by 'city'. You can override using the
## `.groups` argument.
library(ggplot2)
ggplot(data=sales_per_month)+
  geom_col(aes(x=month,
               y=total_sales,
               fill=city),
           position="stack",
           col="black")+
  labs(title="Monthly Sales Distribution",
       x="Months",
       y="Absolute frequencies")+
  scale_x_continuous(breaks=seq(1,12,1))+
  theme_classic()

The graph shows a monthly distribution of sales summed for the 5 years.

It is noted that the city that recorded the least sales overall is Wichita Falls, followed by Beaumont, Bryan-College Station and finally Tyler. The months in which the most sales were recorded overall are the summer months, from May to August, with a peak in June. Instead, the months with the least sales are January and February.

#normalized monthly sales distribution

N=sum(sales_per_month$total_sales)
rel_freq=(sales_per_month$total_sales)/N

sales_per_month$rel_freq=rel_freq

library(ggplot2)
ggplot(data=sales_per_month)+
  geom_col(aes(x=month,
               y=rel_freq,
               fill=city),
           position="fill",
           col="black")+
  labs(title="Relative Monthly Sales Distribution",
       x="Months",
       y="Relative frequencies")+
  scale_x_continuous(breaks=seq(1,12,1))+
  scale_y_continuous(breaks=seq(0,1,0.1))

The normalized bar chart confirms the trend of individual cities.

# annual sales distribution

library(dplyr)  

sales_per_year=data %>%
  group_by(city, year) %>%
  summarise(total_sales=sum(sales))
## `summarise()` has grouped output by 'city'. You can override using the
## `.groups` argument.
library(ggplot2)
ggplot(data=sales_per_year)+
  geom_col(aes(x=year,
               y=total_sales,
               fill=city),
           position="stack",
           col="black")+
  labs(title="Annual Sales Distribution",
       x="Years",
       y="Absolute frequencies")+
  theme_classic()

The graph shows the sales trend over the 5 years. Over the years, there is an improvement, especially in 2014.

The city that contributed the most to this growth is Tyler, followed by Bryan-College Station, Beaumont and finally Wichita Falls. Furthermore, the latter has a rather static trend.

# Average sales price per year

average_volume=data%>%
  group_by(city, year) %>%
  summarise(average=mean(volume),
            st_dev=sd(volume))
## `summarise()` has grouped output by 'city'. You can override using the
## `.groups` argument.
ggplot(data=average_volume)+
  geom_col(aes(x=year,
               y=average,
               fill=city),
           position="dodge",
           col="black")+
  labs(title="Average sales price per year",
       x="Years",
       y="Average Price (million dollars)")+
  theme_classic()

The graph shows the average sales price by year and city.

The city with the highest average price is Tyler, followed by Bryan-College Station, Beaumont, and Wichita Falls. It is also noted that over the years there has been an increase in the average price for all cities except Wichita Falls.

# boxplot for sales price distribution

ggplot(data=data)+
  geom_boxplot(aes(x=city,
                   y=volume),
               fill="lightblue")+
  labs(x="City",
       y="Sales Price")+
  ggtitle("Sales price distribution")+
  theme(plot.title = element_text(hjust = 0.5, size=14, face="bold"))

The graph confirms Tyler as the city with the highest number of high prices and shows how Bryan-College Station has the most variability in the data distribution.

library(dplyr)
library(ggplot2)

# line chart for annual sales distribution

data$date=as.Date(with(data, paste(year, month, "01", sep="-")), "%Y-%m-%d")
ggplot(data=data)+
  geom_line(aes(x=date, y=sales, color=city), lwd=1)+
  labs(x="Date",
       y="Sales",
       title="Annual sales distribution")+
  scale_x_date(date_breaks = "3 month", date_labels = "%Y-%m")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The graph confirms what was found with the previous graphs. That is, Tyler is the most consistent with high sales, while Wichita Falls is the most consistent with low sales. Bryan-College Station is the most variable. It is interesting to underline how the sales peaks coincide with the spring-summer periods.

CONCLUSIONS

From this statistical analysis regarding the Texas region we can highlight the following conclusions:

Therefore, we can conclude that Tyler has the best market strategy (also considering the number of active ads and the months_inventory). It is advisable to review the Wichita Falls strategy.