Set the right working directory and verify the files in it

setwd("C:/Users/TommasoScortecci/Test Finale")
getwd()
## [1] "C:/Users/TommasoScortecci/Test Finale"
dir()
## [1] "Progetto Finale.R"         "progettoRprofessionai.Rmd"
## [3] "realestate_texas.csv"

Import the csv file and verify the structure and dimensions

df <- read.csv("realestate_texas.csv", sep=",")
str(df) # variables analysis
## 'data.frame':    240 obs. of  8 variables:
##  $ city            : chr  "Beaumont" "Beaumont" "Beaumont" "Beaumont" ...
##  $ year            : int  2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ month           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ sales           : int  83 108 182 200 202 189 164 174 124 150 ...
##  $ volume          : num  14.2 17.7 28.7 26.8 28.8 ...
##  $ median_price    : num  163800 138200 122400 123200 123100 ...
##  $ listings        : int  1533 1586 1689 1708 1771 1803 1857 1830 1829 1779 ...
##  $ months_inventory: num  9.5 10 10.6 10.6 10.9 11.1 11.7 11.6 11.7 11.5 ...
dim(df) # dataframe analysis
## [1] 240   8
attach(df)

Quick analysis of the dataset’s variables

summary(df) # quartiles
##      city                year          month           sales      
##  Length:240         Min.   :2010   Min.   : 1.00   Min.   : 79.0  
##  Class :character   1st Qu.:2011   1st Qu.: 3.75   1st Qu.:127.0  
##  Mode  :character   Median :2012   Median : 6.50   Median :175.5  
##                     Mean   :2012   Mean   : 6.50   Mean   :192.3  
##                     3rd Qu.:2013   3rd Qu.: 9.25   3rd Qu.:247.0  
##                     Max.   :2014   Max.   :12.00   Max.   :423.0  
##      volume        median_price       listings    months_inventory
##  Min.   : 8.166   Min.   : 73800   Min.   : 743   Min.   : 3.400  
##  1st Qu.:17.660   1st Qu.:117300   1st Qu.:1026   1st Qu.: 7.800  
##  Median :27.062   Median :134500   Median :1618   Median : 8.950  
##  Mean   :31.005   Mean   :132665   Mean   :1738   Mean   : 9.193  
##  3rd Qu.:40.893   3rd Qu.:150050   3rd Qu.:2056   3rd Qu.:10.950  
##  Max.   :83.547   Max.   :180000   Max.   :3296   Max.   :14.900
head(df) # first 6 rows
##       city year month sales volume median_price listings months_inventory
## 1 Beaumont 2010     1    83 14.162       163800     1533              9.5
## 2 Beaumont 2010     2   108 17.690       138200     1586             10.0
## 3 Beaumont 2010     3   182 28.701       122400     1689             10.6
## 4 Beaumont 2010     4   200 26.819       123200     1708             10.6
## 5 Beaumont 2010     5   202 28.833       123100     1771             10.9
## 6 Beaumont 2010     6   189 27.219       122800     1803             11.1
df$month_year <- paste(month, "/", year)
The dataset contains 240 rows and 8 variables:
-‘city’ categorical qualitative
-‘year’ ordinal qualitative
-‘month’ ordinal qualitative
-‘sales’ discrete quantitative
-‘volume’ continuous quantitative
-‘median_price’ discrete quantitative
-‘listings’ discrete quantitative
-‘months_inventory’ continuous quantitative
I decided to add the ‘month_year’ variable so I can do analysis on time dimension
We can do analysis on the distribution of quantitative variables by qualitative variables

Moments library

library(moments)

Position, variability and shape indexes and frequencies

# 'Sales' variable
mean(sales)
## [1] 192.2917
quantile(sales)
##    0%   25%   50%   75%  100% 
##  79.0 127.0 175.5 247.0 423.0
IQR(sales)
## [1] 120
sd(sales)
## [1] 79.65111
skewness(sales) # positive skewness
## [1] 0.718104
kurtosis(sales) - 3 # platicurtic
## [1] -0.3131764
# 'Volume' variable
mean(volume)
## [1] 31.00519
quantile(volume)
##      0%     25%     50%     75%    100% 
##  8.1660 17.6595 27.0625 40.8930 83.5470
IQR(volume)
## [1] 23.2335
sd(volume)
## [1] 16.65145
skewness(volume) # positive skewness
## [1] 0.884742
kurtosis(volume) - 3 # leptocurtic
## [1] 0.176987
# 'Median_price' variable
mean(median_price)
## [1] 132665.4
quantile(median_price)
##     0%    25%    50%    75%   100% 
##  73800 117300 134500 150050 180000
IQR(median_price)
## [1] 32750
sd(median_price)
## [1] 22662.15
skewness(median_price) # negative skewness
## [1] -0.3645529
kurtosis(median_price) - 3 # platicurtic
## [1] -0.6229618
# 'Listings' variable
mean(listings)
## [1] 1738.021
quantile(listings)
##     0%    25%    50%    75%   100% 
##  743.0 1026.5 1618.5 2056.0 3296.0
IQR(listings)
## [1] 1029.5
sd(listings)
## [1] 752.7078
skewness(listings) # positive skewness
## [1] 0.6494982
kurtosis(listings) - 3 # platicurtic
## [1] -0.79179
# 'Months_inventory' variable
mean(months_inventory)
## [1] 9.1925
quantile(months_inventory)
##    0%   25%   50%   75%  100% 
##  3.40  7.80  8.95 10.95 14.90
IQR(months_inventory)
## [1] 3.15
sd(months_inventory)
## [1] 2.303669
skewness(months_inventory) # positive skewness
## [1] 0.04097527
kurtosis(months_inventory) - 3 # platicurtic
## [1] -0.1744475
# Graphic analysis for 'volume'
boxplot(volume, col = "lightblue",
        main = "Volume Boxplot (mln)",
        ylab = "Volume") # few outliers

hist(volume, probability = T, col = "lightblue",
     main = "Volume trend analysis (mln)",
     xlab = "Volume",
     ylab = "Density")
lines(density(volume)) # we can clearly see the positive skewness

# 'City', 'year', 'month' and 'month_year' frequency
table(city)
## city
##              Beaumont Bryan-College Station                 Tyler 
##                    60                    60                    60 
##         Wichita Falls 
##                    60
table(year)
## year
## 2010 2011 2012 2013 2014 
##   48   48   48   48   48
table(month)
## month
##  1  2  3  4  5  6  7  8  9 10 11 12 
## 20 20 20 20 20 20 20 20 20 20 20 20
table(df$month_year)
## 
##  1 / 2010  1 / 2011  1 / 2012  1 / 2013  1 / 2014 10 / 2010 10 / 2011 10 / 2012 
##         4         4         4         4         4         4         4         4 
## 10 / 2013 10 / 2014 11 / 2010 11 / 2011 11 / 2012 11 / 2013 11 / 2014 12 / 2010 
##         4         4         4         4         4         4         4         4 
## 12 / 2011 12 / 2012 12 / 2013 12 / 2014  2 / 2010  2 / 2011  2 / 2012  2 / 2013 
##         4         4         4         4         4         4         4         4 
##  2 / 2014  3 / 2010  3 / 2011  3 / 2012  3 / 2013  3 / 2014  4 / 2010  4 / 2011 
##         4         4         4         4         4         4         4         4 
##  4 / 2012  4 / 2013  4 / 2014  5 / 2010  5 / 2011  5 / 2012  5 / 2013  5 / 2014 
##         4         4         4         4         4         4         4         4 
##  6 / 2010  6 / 2011  6 / 2012  6 / 2013  6 / 2014  7 / 2010  7 / 2011  7 / 2012 
##         4         4         4         4         4         4         4         4 
##  7 / 2013  7 / 2014  8 / 2010  8 / 2011  8 / 2012  8 / 2013  8 / 2014  9 / 2010 
##         4         4         4         4         4         4         4         4 
##  9 / 2011  9 / 2012  9 / 2013  9 / 2014 
##         4         4         4         4
Most of the variables present positive skeweness; it indicates that the lowest values are more frequent.
If we take as an example the ‘sales’ variabile we can see a good amount of variance between datas with an extense range of datas.
I analysed the ‘volume’ variable with 2 graphs: a boxplot to see the quantiles and the outliers and a histogram (with the density curve) to see the frequency for each class (to see the positive skewness)

Comparisons between variables for variability and skewness

# Coefficient of variation function; it is necessary in order to compare variables with different units of measurement
CV <- function(datas, column){
  sd_v = sd(datas[[column]])
  mean_v = mean(datas[[column]])
  
  return(sd_v/mean_v*100)
}

# Comparison between some variables to see the one with the highest variability
CV(df, "sales")
## [1] 41.42203
CV(df, "volume") # Highest variability
## [1] 53.70536
CV(df, "median_price")
## [1] 17.08218
CV(df, "listings")
## [1] 43.30833
CV(df, "months_inventory")
## [1] 25.06031
# Histograms of the 5 variables above (to analyse the skewness) 
par(mfrow = c(3, 2))
hist(sales, col = "lightblue")
hist(volume, col = "lightgreen")
hist(median_price, col = "lightgrey")
hist(listings, col = "brown")
hist(months_inventory, col = "yellow")
par(mfrow = c(1, 1))

In order to compare the variability for columns with different unit of measurement we must use the coefficient of variation.
From a statistical point of view the variable ‘volume’ is the one whose values vary greatly relative to the average.
If we see above the skewness functions (where I executed the position, variability and shape indexes) and the histograms, ‘volume’ is the variable with the highest skewness.

Classes for ‘median_price’

# Creation of classes for 'median_price' variable
median_price_class = cut(median_price, breaks = 10, labels = FALSE)
breaks_vec <- seq(min(median_price), max(median_price), length.out=11)

# Function to create labels ad-hoc for the barplot
label <- sapply(1:(length(breaks_vec)-1), function(i){
  paste0(round(breaks_vec[i]/1000), "-", round(breaks_vec[i+1]/1000), "k")
})

# Barplot for 'median_price' divided in 10 classes
barplot(table(median_price_class), names.arg = label, las=2, col = "lightblue",
        main = "Barplot for median_price in classes",
        ylab = "Frequency")

# Calculate Gini index
sum_median_price_class = sum(table(median_price_class))
freq_rel_median_price = round(table(median_price_class)/sum_median_price_class, digits = 3)
Gini <- 1-sum(freq_rel_median_price^2)
Gini # 0.86 -> high eterogenity
## [1] 0.862613
Variable ‘median_price’, divided in classes and plotted with a barplot, has high heterogeneity.
It can be seen also through the Gini index (0.86).

Probabilities

# Probability for 'city' = 'Beaumont'
P_Beaumont = sum(city == 'Beaumont')/length(city)
P_Beaumont # P=1/4
## [1] 0.25
# Probability for 'month' = 'July' (7)
P_July = sum(month == 7)/length(month)
P_July # P=1/12
## [1] 0.08333333
# Probability for 'month' = 'December' (12) and 'year' = 2012
P_Dec_2012 = sum((month == 7) & (year == 2012))/nrow(df)
P_Dec_2012 # P=1/60
## [1] 0.01666667

New variables

# New variable 'mean_price' calculated by 'volume'/'sales'
volume_mln = volume * 10^6
df$mean_price = volume_mln/sales

# New variable 'listings_efficiency' calculated by 'sales'/'listings'. We can obtain how well listings have performed
df$listings_efficiency = round(sales/listings, digits = 3)
df$listings_efficiency_perc = paste((df$listings_efficiency*100), "%")

# Visualize the results of the new variables
hist(df$mean_price, col = "lightgreen",
     main = "Histogram of mean_prices",
     xlab = "Mean_prices")

max_val <- max(df$listings_efficiency, na.rm = TRUE)
idx_max <- which.max(df$listings_efficiency)
city_max <- df$city[idx_max]
month_year_max <- df$month_year[idx_max]

boxplot(df$listings_efficiency, col = "lightblue",
        main = "Listings_efficiency Boxplot",
        xlab = "Listings_efficiency",
        horizontal = TRUE)
points(max_val, 1, col="red", pch=19, cex=1.5)
text(max_val - 0.067, 1.1, labels = paste0(round(max_val,2), " (", city_max, ", ", month_year_max, ")"), col="red")

summary(df$listings_efficiency)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.05000 0.08975 0.10950 0.11876 0.13525 0.38700
From the histogram we can see that ‘mean_prices’ between 140k and 170k are the ones with the highest frequency.
From the boxplot we can see the distribution of ‘listings_efficiency’, calculated by ‘sales’/‘listings’.
The maximum value for ‘listings_efficency’ was registrated in 07/2014 for Bryan-College Station where there were 39 sales for every 100 listings.

Dplyr and GGplot2 libraries

library(dplyr)
## 
## Caricamento pacchetto: 'dplyr'
## I seguenti oggetti sono mascherati da 'package:stats':
## 
##     filter, lag
## I seguenti oggetti sono mascherati da 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

Statistical analysis with graphic representations

# Statistical analysis of 'volume' by cities with graphical representation. We can clearly see the cities with higher variability (Tyler and Bryan-College Station).
volume_per_city <- df %>%
  group_by(city) %>%
  summarise(mean_val_city = mean(volume),
            dev_std_city = sd(volume)
  ) %>%
  mutate(label = paste0(city, " (SD=", round(dev_std_city, 2), ")"))
volume_per_city
## # A tibble: 4 × 4
##   city                  mean_val_city dev_std_city label                        
##   <chr>                         <dbl>        <dbl> <chr>                        
## 1 Beaumont                       26.1         6.97 Beaumont (SD=6.97)           
## 2 Bryan-College Station          38.2        17.2  Bryan-College Station (SD=17…
## 3 Tyler                          45.8        13.1  Tyler (SD=13.11)             
## 4 Wichita Falls                  13.9         3.24 Wichita Falls (SD=3.24)
ggplot(df, aes(x = volume, color = city, fill = city)) +
  geom_density(alpha = 0.3) +
  geom_vline(data = volume_per_city, aes(xintercept = mean_val_city, color = city),
             linetype = "dashed", size = 1) +
  scale_color_manual(
    values = scales::hue_pal()(nrow(volume_per_city)),
    labels = volume_per_city$label
  ) +
  labs(x = "Volume", y = "Density",
       title = "Density curve for cities with highlighting means") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Statistical analysis of 'sales' by years with graphical representation. We can see the variability between the years and the trend over time; higher variability and sales (in average) for 2013 and 2104.
sales_per_year <- df %>%
  group_by(year) %>%
  summarise(mean_val_year = mean(sales),
            dev_std_year = sd(sales))
sales_per_year
## # A tibble: 5 × 3
##    year mean_val_year dev_std_year
##   <int>         <dbl>        <dbl>
## 1  2010          169.         60.5
## 2  2011          164.         63.9
## 3  2012          186.         70.9
## 4  2013          212.         84.0
## 5  2014          231.         95.5
ggplot(df, aes(x=year, y=sales)) +
  geom_jitter(width=0.2, alpha=0.5) +
  stat_summary(fun=mean, geom="point", color="red", size=3) +
  labs(title="Volume per città con punti e media")

# Statistical analysis of 'listings' by months with graphical representation. We can see the 4° month as the one with the highest mean and the 7° with the highest SD.
listings_per_month <- df %>%
  group_by(month) %>%
  summarise(mean_val_month = mean(listings),
            dev_std_month = sd(listings))
listings_per_month
## # A tibble: 12 × 3
##    month mean_val_month dev_std_month
##    <int>          <dbl>         <dbl>
##  1     1          1647.          705.
##  2     2          1692.          711.
##  3     3          1757.          727.
##  4     4          1826.          770.
##  5     5          1824.          790.
##  6     6          1833.          812.
##  7     7          1821.          827.
##  8     8          1786.          816.
##  9     9          1749.          803.
## 10    10          1710.          779.
## 11    11          1653.          741.
## 12    12          1558.          693.
boxplot(listings~month, col = "lightblue",
        main = "Boxplot of listings over months",
        xlab = "Month", ylab = "Listings")

# Boxplot to compare 'median_price' by city. Bryan-College Station has the highest median prices, while Wichita Falls has the lowest median prices.
ggplot(df, aes(x=city, y=median_price)) +
  geom_boxplot() +
  labs(x="City", y="Median_price",
       title="Boxplot of median_price by city")

# Barplots: 'sales' by 'month' and 'sales' by 'city'. The highest amount of sales are between May and August and for Tyler, while the lowest sales are in January and for Wichita Falls. I also created a stacked barplot to see the 'sales' bu 'city' and 'month' (Tyler and Bryan-College Station have the highest values).
ggplot(df, aes(x=month, y=sales)) +
  geom_col(fill="blue") +
  labs(x="Month", y="Sales",
       title="Barplot of sales by month") +
  scale_x_continuous(
    breaks = 1:12,
    labels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
               "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
  ) +
  theme_minimal()

ggplot(df, aes(x=month, y=sales, fill=city)) +
  geom_bar(stat="identity") +
  labs(x="Month", y="Sales",
       title="Stacked barplot of sales by month and city") +
  scale_x_continuous(
    breaks = 1:12,
    labels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
               "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
  ) +
  theme_minimal()

ggplot(df, aes(x=city, y=sales)) +
  geom_col(fill="violet") +
  labs(x="City", y="Sales",
       title="Barplot of sales by city") +
  theme_minimal()

# Linecharts for 'sales' trend over the years by 'city'. For every city the 2014 and 2013 where the years with highest sales.
ggplot(df, aes(x = month, y = sales, color=factor(year), group=year)) +
  geom_line() +
  geom_point() +
  facet_wrap(~ city, nrow = 2, ncol = 2) +
  labs(
    title = "Monthly sales trend by year",
    x = "Month",
    y = "Sales"
  ) +
  scale_x_continuous(
    breaks = 1:12,
    labels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
               "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
  ) +
  theme_minimal()

# Linecharts of 'volume' by 'year' (in average) for every 'city'. They all seem to have an upward trend. Tyler has the highest values, while Whicita Falls has a great drop in 2011.
df_Beaumont <- subset(df, city == "Beaumont")
df_agg_Beaumont <- aggregate(volume ~ year, data = df_Beaumont, FUN = mean)
plot(df_agg_Beaumont$year, df_agg_Beaumont$volume, type="l", col="blue", lwd=2,
     main="Linechart of mean volume (mln) by year for Beaumont",
     xlab="Year", ylab="Volume (mln)")

df_Bryan_College_Station <- subset(df, city == "Bryan-College Station")
df_agg_Bryan_College_Station <- aggregate(volume ~ year, data = df_Bryan_College_Station, FUN = mean)
plot(df_agg_Bryan_College_Station$year, df_agg_Bryan_College_Station$volume, type="l", col="red", lwd=2,
     main="Linechart of mean volume (mln) by year for Bryan-College Station",
     xlab="Year", ylab="Volume (mln)")

df_Tyler <- subset(df, city == "Tyler")
df_agg_Tyler <- aggregate(volume ~ year, data = df_Tyler, FUN = mean)
plot(df_agg_Tyler$year, df_agg_Tyler$volume, type="l", col="green", lwd=2,
     main="Linechart of mean volume (mln) by year for Tyler",
     xlab="Year", ylab="Volume (mln)")

df_Wichita_Falls <- subset(df, city == "Wichita Falls")
df_agg_Wichita_Falls <- aggregate(volume ~ year, data = df_Wichita_Falls, FUN = mean)
plot(df_agg_Wichita_Falls$year, df_agg_Wichita_Falls$volume, type="l", col="orange", lwd=2,
     main="Linechart of mean volume (mln) by year for Wichita Falls",
     xlab="Year", ylab="Volume (mln)")

————————–

CONCLUSIONS

————————–

We can also highlight a fall in volumes for Wichita Falls in 2011.

If we compare monthly every year for every city, quantity of sales have the highest values in 2013 and 2014, with a peak in the central months (May-Aug) for Bryan-College Station.

The efficacy of listings (that become sales) is between 5% and 20% for almost every observation, with few outliers.

We can highlight the efficacy of 39% for Bryan-College Station in July 2014. That is the maximum value registered.

The highest classes of median price registered were 127k-138k and 148k-159k. These amounts are nearer to the highest value of median price than to the lowest.