1. Variable analysis
getwd()
## [1] "C:/Users/PC/Desktop/sara/master/Statistica descrittiva"
setwd("C:/Users/PC/Desktop")
dati = read.csv('realestate_texas.csv', sep=',') 

dim(dati)
## [1] 240   8
N=dim(dati)[1]
attach(dati)

the dataset is composed of 240 objects and 8 variables

  1. Position, variability and shape indices:
moda = table(city)
kable(moda)
city Freq
Beaumont 60
Bryan-College Station 60
Tyler 60
Wichita Falls 60

city has a four-modal distribution.

Frequency distribution: year and month:

freq_ass_y = table(year) # Absolute frequency: 48 for each year.
freq_rel_y = table(year)/N # Relative frequency: 0.2 (20%) for each year.
distr_freq_year = cbind(freq_ass_y,freq_rel_y)
kable(distr_freq_year)
freq_ass_y freq_rel_y
2010 48 0.2
2011 48 0.2
2012 48 0.2
2013 48 0.2
2014 48 0.2
freq_ass_m = table(month) 
freq_rel_m = table(month)/N 
distr_freq_month = round(cbind(freq_ass_m,freq_rel_m),2)
kable(distr_freq_month)
freq_ass_m freq_rel_m
20 0.08
20 0.08
20 0.08
20 0.08
20 0.08
20 0.08
20 0.08
20 0.08
20 0.08
20 0.08
20 0.08
20 0.08

Calculating indices: I create a function for quantitative variables

index_calculation = function(x) {
  c(mean = mean(x),
    median = median(x),
    devst = sd(x),
    min = min(x),
    max = max(x),
    Q1 = quantile(x, 0.25),
    Q3 = quantile(x, 0.75),
    IQR = IQR(x),
    kurtosis = kurtosis(x),
    skewness = skewness(x)
  )
}

I apply the function to the variables and create a dataset ‘stat_df’ to see the statistics.

var_quant = c('sales', 'volume', 'median_price', 'listings', 'months_inventory')
stat = sapply(dati[var_quant], index_calculation)
stat_df = as.data.frame(round(stat,2))
kable(stat_df)
sales volume median_price listings months_inventory
mean 192.29 31.01 132665.42 1738.02 9.19
median 175.50 27.06 134500.00 1618.50 8.95
devst 79.65 16.65 22662.15 752.71 2.30
min 79.00 8.17 73800.00 743.00 3.40
max 423.00 83.55 180000.00 3296.00 14.90
Q1.25% 127.00 17.66 117300.00 1026.50 7.80
Q3.75% 247.00 40.89 150050.00 2056.00 10.95
IQR 120.00 23.23 32750.00 1029.50 3.15
kurtosis 2.69 3.18 2.38 2.21 2.83
skewness 0.72 0.88 -0.36 0.65 0.04
  1. Identifying variables with greater variability and skewness

Coefficient of variation (CV)

cv = round(sapply(dati[var_quant],
            function(x){return(sd(x)/mean(x)*100)}),2)
kable(cv)
x
sales 41.42
volume 53.71
median_price 17.08
listings 43.31
months_inventory 25.06

Volume has a higher coefficient of variation than the other variables, with a percentage of 53.71%. Volume has a negative and leptokurtic skewed distribution and a greater asymmetry than the other variables (skewness=88%)

  1. Creating classes for a quantitative variable
dati$sales_cl = cut(dati$sales,#I used the cut function to split the sales data into specific classes
               breaks = c(79,150,250,350,423),
               include.lowest = TRUE, #to include the lower limit of the first class.
               right = FALSE) # to make the interval closed on the left and open on the right.

Calculating frequencies:

ni = table(dati$sales_cl)
fi = round((ni/N),2)
Ni = cumsum(ni)
Fi = round((Ni/N),2)
cbind(ni,fi,Ni,Fi)
##           ni   fi  Ni   Fi
## [79,150)  89 0.37  89 0.37
## [150,250) 91 0.38 180 0.75
## [250,350) 48 0.20 228 0.95
## [350,423] 12 0.05 240 1.00

I combined these results into a dataframe that includes all the calculated frequencies.

distr_freq_sales_cl = as.data.frame((cbind(ni,fi,Ni,Fi)))
kable(distr_freq_sales_cl)
ni fi Ni Fi
[79,150) 89 0.37 89 0.37
[150,250) 91 0.38 180 0.75
[250,350) 48 0.20 228 0.95
[350,423] 12 0.05 240 1.00

Gini index for distr_freq_sales_cl

gini.index = function(x){
  ni = table(x)
  fi = ni/length(x)
  fi2 = fi^2
  J= length(table(x))
  
  gini = 1-sum(fi2)
  gini.norm = gini/((J-1)/J)
  
  return(gini.norm)
}
  
round(gini.index(distr_freq_sales_cl),2)
## [1] 0.75

G’= 0.75 indicates a rather unequal distribution between classes

a bar chart showing the distribution of absolute frequencies for each sales class.

barplot(ni,
        main = 'frequency distribution',
        xlab = 'length classes',
        ylab = 'absolute frequencies',
        col = 'lightblue')

Sales are mainly concentrated in the first two classes: [79,150) and [150,250)

Gini index for the city variable

gini.index(city)
## [1] 1

G’ = 1 Maximum heterogeneity. This happens because the distribution of cities is equally divided (each has a relative frequency of 0.25)

  1. Probability calculation
probability_city = sum(dati$city == 'Beaumont')/nrow(dati)
kable(data.frame("Probability city:" = probability_city))
Probability.city.
0.25

The probability that the city is “Beaumont” is 25%.

probability_month = round(sum(month == '7')/nrow(dati),3)
kable(data.frame("Probability month:" = probability_month))
Probability.month.
0.083

The probability that the month is “7” is 8.3%.

probability_date = round(sum(dati$month == '12' & dati$year == '2012')/nrow(dati),3)
kable(data.frame("Probability date:" = probability_date))
Probability.date.
0.017

The probability that the month is “12” and the year is “2012” is 0.0167 (1.7%).

I created a dataframe called ‘prob’ with the events and probabilities:

prob = data.frame(
  Evento = c("Beaumont", "Luglio", "Dicembre 2012"),
  Probabilità = c(probability_city, probability_month, probability_date)
)
  1. Creating new variables

I created a new column that calculates the mean price of properties.

mean_price_c = mutate(
  dati,
  mean_price = round((volume/sales),2)) %>%
  select(volume,sales,mean_price) %>% head(5)

kable(mean_price_c)
volume sales mean_price
14.162 83 0.17
17.690 108 0.16
28.701 182 0.16
26.819 200 0.13
28.833 202 0.14

I created a column that measures the efficiency listings of sales ads.

efficiency_listings_c = mutate(
  dati,
  efficiency_listings = round((sales/listings),2)) %>%
  select(,sales,listings, efficiency_listings)%>% head(5)

kable(efficiency_listings_c)
sales listings efficiency_listings
83 1533 0.05
108 1586 0.07
182 1689 0.11
200 1708 0.12
202 1771 0.11
  1. Conditional analysis
stat_by_year =dati %>%
  group_by(year) %>%
  summarise(
    mean = round((mean(listings, na.rm = TRUE)),2), # Mean sales
    sd = round((sd(listings, na.rm = TRUE)),2), # Standard deviation of sales.
    min = min(listings), # Min value of sales.
    max = max(listings) # Max value of sales.
  )
kable(stat_by_year)
year mean sd min max
2010 1826.00 785.02 904 3296
2011 1849.65 780.38 844 3266
2012 1776.81 738.45 801 3072
2013 1677.60 743.52 743 2998
2014 1560.04 706.71 746 2875

Statistics by year, month and city

stat_tot = dati %>%
  group_by(city, month, year) %>%
  summarise(
    mean_sales = mean(sales, na.rm = TRUE),
    .groups = 'drop'
  )

VIEWS WITH GGPLOT2

  1. Boxplot: Distribution of average house price among cities
ggplot(dati, aes(x=city,y=median_price))+
  geom_boxplot(fill="lightblue")+
  labs(
    title = 'Distribution of average house price among cities',
    x = 'city',
    y= 'median price'
  ) +
  theme_light() +
  theme(
    plot.title = element_text(hjust = 0.5, face='bold'),
    axis.title.x = element_text(face = 'italic'),
    axis.title.y = element_text(face = 'italic')
  )

The boxplot graph shows the median price on the y-axis and the cities on the x-axis. Points outside the boxplot are outliers, which are found in all cities except Tyler. Also, Wichita Falls has a lower median price than other cities.

  1. Boxplot: distribution of the total value of sales between the various cities and between the various years
ggplot(dati, aes(x=factor(year),y=volume, fill = city))+
  geom_boxplot()+
  labs(
    title = 'distribution of the total value of sales between the various cities and between the various years',
    x = 'year',
    y= 'total sales',
    fill = 'city'
  ) +
  theme_light() +
  theme(
    plot.title = element_text(hjust = 0.5,face='bold'),
    axis.title.x = element_text(face = 'italic'),
    axis.title.y = element_text(face = 'italic')
  )

The graph shows the total sales over the years in the 4 cities. For each year, indicated on the x-axis, the total value of sales for each city is represented by a boxplot of a different color. The highest sales seem to occur in the cities of Bryan-College Station and Tyler, especially in the years 2013-2014. For the city of Beaumont there are outliers in 2012.

  1. Stacked bar chart: distribution of the total value of sales
ggplot(dati)+
  geom_bar(aes(x=factor(month),y=volume, fill = city), stat = "identity")+
  labs(
    title = 'distribution of the total value of sales',
    x = 'month',
    y = 'total sales',
    fill = 'city'
  )+
  theme_light() +
  theme(
    plot.title = element_text(hjust = 0.5,face='bold'),
    axis.title.x = element_text(face = 'italic'),
    axis.title.y = element_text(face = 'italic')
  )

The graph represents the total sales for each city over the months. The months are shown on the x-axis, while the total sales are shown on the y-axis. The different colors represent the 4 cities. From the graph thus obtained we can conclude that greater sales accumulate in the summer months (5,6,7,8), especially for the cities of Bryan College station and Tyler

  1. Stacked bar chart: normalized distribution of the total value of sales
ggplot(dati)+
  geom_bar(aes(x=factor(month),y=volume, fill = city), 
           stat = "identity", position = 'fill')+
  labs(
    title = 'normalized distribution of the total value of sales',
    x = 'month',
    y = 'total sales'
  )+
  theme_light() +
  theme(
    plot.title = element_text(hjust = 0.5,face='bold'),
    axis.title.x = element_text(face = 'italic'),
    axis.title.y = element_text(face = 'italic')
  )

The normalized graph represents total sales for each city over the months.

  1. time series of the total value of sales
date = as.Date(paste(dati$year, dati$month, '01', sep = '-'))

ggplot(dati, aes(x=date, y=sales, color=city))+
  
  geom_line(linewidth=1)+
  geom_point(size=2)+
  
  scale_color_discrete(name = "city") +
  
  labs(
    title = 'time series of the total value of sales',
    x ='date',
    y = 'total sales'
  )+
  scale_x_date(date_breaks = '2 months', 
               date_labels ='%b',
               expand= c(0.02, 0.02)
               )+
  theme_light()+
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = 'bottom',
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = 'gray90'),
    strip.text = element_text(face = 'bold')
  ) + 
  facet_wrap(~year, ncol = 3, scales = 'free_x')

The line charts show the total sales of the 4 cities over the years (2010-2014). The x-axis shows the months, while the y-axis shows the total sales for each city. Looking at the chart, we can conclude that the city of Tyler, compared to the other cities, has had the most sales since 2010 and continues to grow until 2014. On the contrary, the city with the least sales is Wichita Falls, where total sales remain constant from 2010 to 2014.