Texas Real Estate Market Analysis

Variable analysis

import the dataset

getwd()

## [1] "C:/Users/PC/Desktop/sara/master/Statistica descrittiva"

setwd("C:/Users/PC/Desktop")
dati = read.csv('realestate_texas.csv', sep=',') 

dim(dati)

## [1] 240   8

N=dim(dati)[1]
attach(dati)

the dataset is composed of 240 objects and 8 variables

city: nominal qualitative variable;
year: ordinal qualitative variable;
month: nominal qualitative variable (cyclic);
sales: discrete quantitative variable;
volume: continuous quantitative variable;
median_price: continuous quantitative variable;
listings: discrete quantitative variable;
months_inventory: continuous quantitative variable;

Position, variability and shape indices:

mode: variable with four modes, all with an absolute frequency of 60.

moda = table(city)
kable(moda)

city	Freq
Beaumont	60
Bryan-College Station	60
Tyler	60
Wichita Falls	60

city has a four-modal distribution.

Frequency distribution: year and month:

year

freq_ass_y = table(year) # Absolute frequency: 48 for each year.
freq_rel_y = table(year)/N # Relative frequency: 0.2 (20%) for each year.
distr_freq_year = cbind(freq_ass_y,freq_rel_y)
kable(distr_freq_year)

	freq_ass_y	freq_rel_y
2010	48	0.2
2011	48	0.2
2012	48	0.2
2013	48	0.2
2014	48	0.2

month: this variable is equally distributed and occurs 20 times in the dataset each month

freq_ass_m = table(month) 
freq_rel_m = table(month)/N 
distr_freq_month = round(cbind(freq_ass_m,freq_rel_m),2)
kable(distr_freq_month)

freq_ass_m	freq_rel_m
20	0.08
20	0.08
20	0.08
20	0.08
20	0.08
20	0.08
20	0.08
20	0.08
20	0.08
20	0.08
20	0.08
20	0.08

Calculating indices: I create a function for quantitative variables

index_calculation = function(x) {
  c(mean = mean(x),
    median = median(x),
    devst = sd(x),
    min = min(x),
    max = max(x),
    Q1 = quantile(x, 0.25),
    Q3 = quantile(x, 0.75),
    IQR = IQR(x),
    kurtosis = kurtosis(x),
    skewness = skewness(x)
  )
}

I apply the function to the variables and create a dataset ‘stat_df’ to see the statistics.

var_quant = c('sales', 'volume', 'median_price', 'listings', 'months_inventory')
stat = sapply(dati[var_quant], index_calculation)
stat_df = as.data.frame(round(stat,2))
kable(stat_df)

	sales	volume	median_price	listings	months_inventory
mean	192.29	31.01	132665.42	1738.02	9.19
median	175.50	27.06	134500.00	1618.50	8.95
devst	79.65	16.65	22662.15	752.71	2.30
min	79.00	8.17	73800.00	743.00	3.40
max	423.00	83.55	180000.00	3296.00	14.90
Q1.25%	127.00	17.66	117300.00	1026.50	7.80
Q3.75%	247.00	40.89	150050.00	2056.00	10.95
IQR	120.00	23.23	32750.00	1029.50	3.15
kurtosis	2.69	3.18	2.38	2.21	2.83
skewness	0.72	0.88	-0.36	0.65	0.04

Identifying variables with greater variability and skewness

Coefficient of variation (CV)

cv = round(sapply(dati[var_quant],
            function(x){return(sd(x)/mean(x)*100)}),2)
kable(cv)

	x
sales	41.42
volume	53.71
median_price	17.08
listings	43.31
months_inventory	25.06

Volume has a higher coefficient of variation than the other variables, with a percentage of 53.71%. Volume has a negative and leptokurtic skewed distribution and a greater asymmetry than the other variables (skewness=88%)

Creating classes for a quantitative variable

dati$sales_cl = cut(dati$sales,#I used the cut function to split the sales data into specific classes
               breaks = c(79,150,250,350,423),
               include.lowest = TRUE, #to include the lower limit of the first class.
               right = FALSE) # to make the interval closed on the left and open on the right.

Calculating frequencies:

ni = table(dati$sales_cl)
fi = round((ni/N),2)
Ni = cumsum(ni)
Fi = round((Ni/N),2)
cbind(ni,fi,Ni,Fi)

##           ni   fi  Ni   Fi
## [79,150)  89 0.37  89 0.37
## [150,250) 91 0.38 180 0.75
## [250,350) 48 0.20 228 0.95
## [350,423] 12 0.05 240 1.00

I combined these results into a dataframe that includes all the calculated frequencies.

distr_freq_sales_cl = as.data.frame((cbind(ni,fi,Ni,Fi)))
kable(distr_freq_sales_cl)

	ni	fi	Ni	Fi
[79,150)	89	0.37	89	0.37
[150,250)	91	0.38	180	0.75
[250,350)	48	0.20	228	0.95
[350,423]	12	0.05	240	1.00

Gini index for distr_freq_sales_cl

gini.index = function(x){
  ni = table(x)
  fi = ni/length(x)
  fi2 = fi^2
  J= length(table(x))
  
  gini = 1-sum(fi2)
  gini.norm = gini/((J-1)/J)
  
  return(gini.norm)
}
  
round(gini.index(distr_freq_sales_cl),2)

## [1] 0.75

G’= 0.75 indicates a rather unequal distribution between classes

a bar chart showing the distribution of absolute frequencies for each sales class.

barplot(ni,
        main = 'frequency distribution',
        xlab = 'length classes',
        ylab = 'absolute frequencies',
        col = 'lightblue')

Sales are mainly concentrated in the first two classes: [79,150) and [150,250)

Gini index for the city variable

gini.index(city)

## [1] 1

G’ = 1 Maximum heterogeneity. This happens because the distribution of cities is equally divided (each has a relative frequency of 0.25)

Probability calculation

probability_city = sum(dati$city == 'Beaumont')/nrow(dati)
kable(data.frame("Probability city:" = probability_city))

Probability.city.
0.25

The probability that the city is “Beaumont” is 25%.

probability_month = round(sum(month == '7')/nrow(dati),3)
kable(data.frame("Probability month:" = probability_month))

Probability.month.
0.083

The probability that the month is “7” is 8.3%.

probability_date = round(sum(dati$month == '12' & dati$year == '2012')/nrow(dati),3)
kable(data.frame("Probability date:" = probability_date))

Probability.date.
0.017

The probability that the month is “12” and the year is “2012” is 0.0167 (1.7%).

I created a dataframe called ‘prob’ with the events and probabilities:

prob = data.frame(
  Evento = c("Beaumont", "Luglio", "Dicembre 2012"),
  Probabilità = c(probability_city, probability_month, probability_date)
)

Creating new variables

I created a new column that calculates the mean price of properties.

mean_price_c = mutate(
  dati,
  mean_price = round((volume/sales),2)) %>%
  select(volume,sales,mean_price) %>% head(5)

kable(mean_price_c)

volume	sales	mean_price
14.162	83	0.17
17.690	108	0.16
28.701	182	0.16
26.819	200	0.13
28.833	202	0.14

The mean price decreases as sales increase

I created a column that measures the efficiency listings of sales ads.

efficiency_listings_c = mutate(
  dati,
  efficiency_listings = round((sales/listings),2)) %>%
  select(,sales,listings, efficiency_listings)%>% head(5)

kable(efficiency_listings_c)

sales	listings	efficiency_listings
83	1533	0.05
108	1586	0.07
182	1689	0.11
200	1708	0.12
202	1771	0.11

Ad efficiency listings appears to be low, ranging from 5% to 11%.

Conditional analysis

Statistics by year

stat_by_year =dati %>%
  group_by(year) %>%
  summarise(
    mean = round((mean(listings, na.rm = TRUE)),2), # Mean sales
    sd = round((sd(listings, na.rm = TRUE)),2), # Standard deviation of sales.
    min = min(listings), # Min value of sales.
    max = max(listings) # Max value of sales.
  )
kable(stat_by_year)

year	mean	sd	min	max
2010	1826.00	785.02	904	3296
2011	1849.65	780.38	844	3266
2012	1776.81	738.45	801	3072
2013	1677.60	743.52	743	2998
2014	1560.04	706.71	746	2875

The results show a decline in mean sales from 2010 to 2014, with a more pronounced decline starting in 2013.

Statistics by year, month and city

stat_tot = dati %>%
  group_by(city, month, year) %>%
  summarise(
    mean_sales = mean(sales, na.rm = TRUE),
    .groups = 'drop'
  )

VIEWS WITH GGPLOT2

Boxplot: Distribution of average house price among cities

ggplot(dati, aes(x=city,y=median_price))+
  geom_boxplot(fill="lightblue")+
  labs(
    title = 'Distribution of average house price among cities',
    x = 'city',
    y= 'median price'
  ) +
  theme_light() +
  theme(
    plot.title = element_text(hjust = 0.5, face='bold'),
    axis.title.x = element_text(face = 'italic'),
    axis.title.y = element_text(face = 'italic')
  )

The boxplot graph shows the median price on the y-axis and the cities on the x-axis. Points outside the boxplot are outliers, which are found in all cities except Tyler. Also, Wichita Falls has a lower median price than other cities.

Boxplot: distribution of the total value of sales between the various cities and between the various years

ggplot(dati, aes(x=factor(year),y=volume, fill = city))+
  geom_boxplot()+
  labs(
    title = 'distribution of the total value of sales between the various cities and between the various years',
    x = 'year',
    y= 'total sales',
    fill = 'city'
  ) +
  theme_light() +
  theme(
    plot.title = element_text(hjust = 0.5,face='bold'),
    axis.title.x = element_text(face = 'italic'),
    axis.title.y = element_text(face = 'italic')
  )

The graph shows the total sales over the years in the 4 cities. For each year, indicated on the x-axis, the total value of sales for each city is represented by a boxplot of a different color. The highest sales seem to occur in the cities of Bryan-College Station and Tyler, especially in the years 2013-2014. For the city of Beaumont there are outliers in 2012.

Stacked bar chart: distribution of the total value of sales

ggplot(dati)+
  geom_bar(aes(x=factor(month),y=volume, fill = city), stat = "identity")+
  labs(
    title = 'distribution of the total value of sales',
    x = 'month',
    y = 'total sales',
    fill = 'city'
  )+
  theme_light() +
  theme(
    plot.title = element_text(hjust = 0.5,face='bold'),
    axis.title.x = element_text(face = 'italic'),
    axis.title.y = element_text(face = 'italic')
  )

The graph represents the total sales for each city over the months. The months are shown on the x-axis, while the total sales are shown on the y-axis. The different colors represent the 4 cities. From the graph thus obtained we can conclude that greater sales accumulate in the summer months (5,6,7,8), especially for the cities of Bryan College station and Tyler

Stacked bar chart: normalized distribution of the total value of sales

ggplot(dati)+
  geom_bar(aes(x=factor(month),y=volume, fill = city), 
           stat = "identity", position = 'fill')+
  labs(
    title = 'normalized distribution of the total value of sales',
    x = 'month',
    y = 'total sales'
  )+
  theme_light() +
  theme(
    plot.title = element_text(hjust = 0.5,face='bold'),
    axis.title.x = element_text(face = 'italic'),
    axis.title.y = element_text(face = 'italic')
  )

The normalized graph represents total sales for each city over the months.

time series of the total value of sales

date = as.Date(paste(dati$year, dati$month, '01', sep = '-'))

ggplot(dati, aes(x=date, y=sales, color=city))+
  
  geom_line(linewidth=1)+
  geom_point(size=2)+
  
  scale_color_discrete(name = "city") +
  
  labs(
    title = 'time series of the total value of sales',
    x ='date',
    y = 'total sales'
  )+
  scale_x_date(date_breaks = '2 months', 
               date_labels ='%b',
               expand= c(0.02, 0.02)
               )+
  theme_light()+
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = 'bottom',
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = 'gray90'),
    strip.text = element_text(face = 'bold')
  ) + 
  facet_wrap(~year, ncol = 3, scales = 'free_x')

The line charts show the total sales of the 4 cities over the years (2010-2014). The x-axis shows the months, while the y-axis shows the total sales for each city. Looking at the chart, we can conclude that the city of Tyler, compared to the other cities, has had the most sales since 2010 and continues to grow until 2014. On the contrary, the city with the least sales is Wichita Falls, where total sales remain constant from 2010 to 2014.

Texas Real Estate Market Analysis

Sara Greco