Set the right working directory and verify the files in it
setwd("C:/Users/TommasoScortecci/Test Finale")
getwd()
## [1] "C:/Users/TommasoScortecci/Test Finale"
dir()
## [1] "Progetto Finale.R" "progettoRprofessionai.Rmd"
## [3] "realestate_texas.csv"
Import the csv file and verify the structure and dimensions
df <- read.csv("realestate_texas.csv", sep=",")
str(df) # variables analysis
## 'data.frame': 240 obs. of 8 variables:
## $ city : chr "Beaumont" "Beaumont" "Beaumont" "Beaumont" ...
## $ year : int 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ month : int 1 2 3 4 5 6 7 8 9 10 ...
## $ sales : int 83 108 182 200 202 189 164 174 124 150 ...
## $ volume : num 14.2 17.7 28.7 26.8 28.8 ...
## $ median_price : num 163800 138200 122400 123200 123100 ...
## $ listings : int 1533 1586 1689 1708 1771 1803 1857 1830 1829 1779 ...
## $ months_inventory: num 9.5 10 10.6 10.6 10.9 11.1 11.7 11.6 11.7 11.5 ...
dim(df) # dataframe analysis
## [1] 240 8
attach(df)
Quick analysis of the dataset’s variables
summary(df) # quartiles
## city year month sales
## Length:240 Min. :2010 Min. : 1.00 Min. : 79.0
## Class :character 1st Qu.:2011 1st Qu.: 3.75 1st Qu.:127.0
## Mode :character Median :2012 Median : 6.50 Median :175.5
## Mean :2012 Mean : 6.50 Mean :192.3
## 3rd Qu.:2013 3rd Qu.: 9.25 3rd Qu.:247.0
## Max. :2014 Max. :12.00 Max. :423.0
## volume median_price listings months_inventory
## Min. : 8.166 Min. : 73800 Min. : 743 Min. : 3.400
## 1st Qu.:17.660 1st Qu.:117300 1st Qu.:1026 1st Qu.: 7.800
## Median :27.062 Median :134500 Median :1618 Median : 8.950
## Mean :31.005 Mean :132665 Mean :1738 Mean : 9.193
## 3rd Qu.:40.893 3rd Qu.:150050 3rd Qu.:2056 3rd Qu.:10.950
## Max. :83.547 Max. :180000 Max. :3296 Max. :14.900
head(df) # first 6 rows
## city year month sales volume median_price listings months_inventory
## 1 Beaumont 2010 1 83 14.162 163800 1533 9.5
## 2 Beaumont 2010 2 108 17.690 138200 1586 10.0
## 3 Beaumont 2010 3 182 28.701 122400 1689 10.6
## 4 Beaumont 2010 4 200 26.819 123200 1708 10.6
## 5 Beaumont 2010 5 202 28.833 123100 1771 10.9
## 6 Beaumont 2010 6 189 27.219 122800 1803 11.1
df$month_year <- paste(month, "/", year)
The dataset contains 240 rows and 8
variables:
-‘city’ categorical qualitative
-‘year’ ordinal qualitative
-‘month’ ordinal qualitative
-‘sales’ discrete quantitative
-‘volume’ continuous quantitative
-‘median_price’ discrete quantitative
-‘listings’ discrete quantitative
-‘months_inventory’ continuous quantitative
I decided to add the ‘month_year’ variable so I can do analysis on
time dimension
We can do analysis on the distribution of quantitative variables by
qualitative variables
Moments library
library(moments)
Position, variability and shape indexes and frequencies
# 'Sales' variable
mean(sales)
## [1] 192.2917
quantile(sales)
## 0% 25% 50% 75% 100%
## 79.0 127.0 175.5 247.0 423.0
IQR(sales)
## [1] 120
sd(sales)
## [1] 79.65111
skewness(sales) # positive skewness
## [1] 0.718104
kurtosis(sales) - 3 # platicurtic
## [1] -0.3131764
# 'Volume' variable
mean(volume)
## [1] 31.00519
quantile(volume)
## 0% 25% 50% 75% 100%
## 8.1660 17.6595 27.0625 40.8930 83.5470
IQR(volume)
## [1] 23.2335
sd(volume)
## [1] 16.65145
skewness(volume) # positive skewness
## [1] 0.884742
kurtosis(volume) - 3 # leptocurtic
## [1] 0.176987
# 'Median_price' variable
mean(median_price)
## [1] 132665.4
quantile(median_price)
## 0% 25% 50% 75% 100%
## 73800 117300 134500 150050 180000
IQR(median_price)
## [1] 32750
sd(median_price)
## [1] 22662.15
skewness(median_price) # negative skewness
## [1] -0.3645529
kurtosis(median_price) - 3 # platicurtic
## [1] -0.6229618
# 'Listings' variable
mean(listings)
## [1] 1738.021
quantile(listings)
## 0% 25% 50% 75% 100%
## 743.0 1026.5 1618.5 2056.0 3296.0
IQR(listings)
## [1] 1029.5
sd(listings)
## [1] 752.7078
skewness(listings) # positive skewness
## [1] 0.6494982
kurtosis(listings) - 3 # platicurtic
## [1] -0.79179
# 'Months_inventory' variable
mean(months_inventory)
## [1] 9.1925
quantile(months_inventory)
## 0% 25% 50% 75% 100%
## 3.40 7.80 8.95 10.95 14.90
IQR(months_inventory)
## [1] 3.15
sd(months_inventory)
## [1] 2.303669
skewness(months_inventory) # positive skewness
## [1] 0.04097527
kurtosis(months_inventory) - 3 # platicurtic
## [1] -0.1744475
# Graphic analysis for 'volume'
boxplot(volume, col = "lightblue",
main = "Volume Boxplot (mln)",
ylab = "Volume") # few outliers

hist(volume, probability = T, col = "lightblue",
main = "Volume trend analysis (mln)",
xlab = "Volume",
ylab = "Density")
lines(density(volume)) # we can clearly see the positive skewness

# 'City', 'year', 'month' and 'month_year' frequency
table(city)
## city
## Beaumont Bryan-College Station Tyler
## 60 60 60
## Wichita Falls
## 60
table(year)
## year
## 2010 2011 2012 2013 2014
## 48 48 48 48 48
table(month)
## month
## 1 2 3 4 5 6 7 8 9 10 11 12
## 20 20 20 20 20 20 20 20 20 20 20 20
table(df$month_year)
##
## 1 / 2010 1 / 2011 1 / 2012 1 / 2013 1 / 2014 10 / 2010 10 / 2011 10 / 2012
## 4 4 4 4 4 4 4 4
## 10 / 2013 10 / 2014 11 / 2010 11 / 2011 11 / 2012 11 / 2013 11 / 2014 12 / 2010
## 4 4 4 4 4 4 4 4
## 12 / 2011 12 / 2012 12 / 2013 12 / 2014 2 / 2010 2 / 2011 2 / 2012 2 / 2013
## 4 4 4 4 4 4 4 4
## 2 / 2014 3 / 2010 3 / 2011 3 / 2012 3 / 2013 3 / 2014 4 / 2010 4 / 2011
## 4 4 4 4 4 4 4 4
## 4 / 2012 4 / 2013 4 / 2014 5 / 2010 5 / 2011 5 / 2012 5 / 2013 5 / 2014
## 4 4 4 4 4 4 4 4
## 6 / 2010 6 / 2011 6 / 2012 6 / 2013 6 / 2014 7 / 2010 7 / 2011 7 / 2012
## 4 4 4 4 4 4 4 4
## 7 / 2013 7 / 2014 8 / 2010 8 / 2011 8 / 2012 8 / 2013 8 / 2014 9 / 2010
## 4 4 4 4 4 4 4 4
## 9 / 2011 9 / 2012 9 / 2013 9 / 2014
## 4 4 4 4
Most of the variables present positive skeweness;
it indicates that the lowest values are more frequent.
If we take as an example the ‘sales’ variabile we can see a good
amount of variance between datas with an extense range of datas.
I analysed the ‘volume’ variable with 2 graphs: a
boxplot to see the quantiles and the outliers and a
histogram (with the density curve) to see the frequency
for each class (to see the positive skewness)
Comparisons between variables for variability and skewness
# Coefficient of variation function; it is necessary in order to compare variables with different units of measurement
CV <- function(datas, column){
sd_v = sd(datas[[column]])
mean_v = mean(datas[[column]])
return(sd_v/mean_v*100)
}
# Comparison between some variables to see the one with the highest variability
CV(df, "sales")
## [1] 41.42203
CV(df, "volume") # Highest variability
## [1] 53.70536
CV(df, "median_price")
## [1] 17.08218
CV(df, "listings")
## [1] 43.30833
CV(df, "months_inventory")
## [1] 25.06031
# Histograms of the 5 variables above (to analyse the skewness)
par(mfrow = c(3, 2))
hist(sales, col = "lightblue")
hist(volume, col = "lightgreen")
hist(median_price, col = "lightgrey")
hist(listings, col = "brown")
hist(months_inventory, col = "yellow")
par(mfrow = c(1, 1))

In order to compare the variability for columns
with different unit of measurement we must use the coefficient
of variation.
From a statistical point of view the variable
‘volume’ is the one whose values vary greatly relative
to the average.
If we see above the skewness functions (where I
executed the position, variability and shape indexes) and the
histograms, ‘volume’ is the variable
with the highest skewness.
Probabilities
# Probability for 'city' = 'Beaumont'
P_Beaumont = sum(city == 'Beaumont')/length(city)
P_Beaumont # P=1/4
## [1] 0.25
# Probability for 'month' = 'July' (7)
P_July = sum(month == 7)/length(month)
P_July # P=1/12
## [1] 0.08333333
# Probability for 'month' = 'December' (12) and 'year' = 2012
P_Dec_2012 = sum((month == 7) & (year == 2012))/nrow(df)
P_Dec_2012 # P=1/60
## [1] 0.01666667
New variables
# New variable 'mean_price' calculated by 'volume'/'sales'
volume_mln = volume * 10^6
df$mean_price = volume_mln/sales
# New variable 'listings_efficiency' calculated by 'sales'/'listings'. We can obtain how well listings have performed
df$listings_efficiency = round(sales/listings, digits = 3)
df$listings_efficiency_perc = paste((df$listings_efficiency*100), "%")
# Visualize the results of the new variables
hist(df$mean_price, col = "lightgreen",
main = "Histogram of mean_prices",
xlab = "Mean_prices")

max_val <- max(df$listings_efficiency, na.rm = TRUE)
idx_max <- which.max(df$listings_efficiency)
city_max <- df$city[idx_max]
month_year_max <- df$month_year[idx_max]
boxplot(df$listings_efficiency, col = "lightblue",
main = "Listings_efficiency Boxplot",
xlab = "Listings_efficiency",
horizontal = TRUE)
points(max_val, 1, col="red", pch=19, cex=1.5)
text(max_val - 0.067, 1.1, labels = paste0(round(max_val,2), " (", city_max, ", ", month_year_max, ")"), col="red")

summary(df$listings_efficiency)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.05000 0.08975 0.10950 0.11876 0.13525 0.38700
From the histogram we can see that ‘mean_prices’ between
140k and 170k are the ones with the highest frequency.
From the boxplot we can see the distribution of
‘listings_efficiency’, calculated by ‘sales’/‘listings’.
The maximum value for ‘listings_efficency’ was
registrated in 07/2014 for Bryan-College Station where
there were 39 sales for every 100 listings.
Dplyr and GGplot2 libraries
library(dplyr)
##
## Caricamento pacchetto: 'dplyr'
## I seguenti oggetti sono mascherati da 'package:stats':
##
## filter, lag
## I seguenti oggetti sono mascherati da 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
Statistical analysis with graphic representations
# Statistical analysis of 'volume' by cities with graphical representation. We can clearly see the cities with higher variability (Tyler and Bryan-College Station).
volume_per_city <- df %>%
group_by(city) %>%
summarise(mean_val_city = mean(volume),
dev_std_city = sd(volume)
) %>%
mutate(label = paste0(city, " (SD=", round(dev_std_city, 2), ")"))
volume_per_city
## # A tibble: 4 × 4
## city mean_val_city dev_std_city label
## <chr> <dbl> <dbl> <chr>
## 1 Beaumont 26.1 6.97 Beaumont (SD=6.97)
## 2 Bryan-College Station 38.2 17.2 Bryan-College Station (SD=17…
## 3 Tyler 45.8 13.1 Tyler (SD=13.11)
## 4 Wichita Falls 13.9 3.24 Wichita Falls (SD=3.24)
ggplot(df, aes(x = volume, color = city, fill = city)) +
geom_density(alpha = 0.3) +
geom_vline(data = volume_per_city, aes(xintercept = mean_val_city, color = city),
linetype = "dashed", size = 1) +
scale_color_manual(
values = scales::hue_pal()(nrow(volume_per_city)),
labels = volume_per_city$label
) +
labs(x = "Volume", y = "Density",
title = "Density curve for cities with highlighting means") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Statistical analysis of 'sales' by years with graphical representation. We can see the variability between the years and the trend over time; higher variability and sales (in average) for 2013 and 2104.
sales_per_year <- df %>%
group_by(year) %>%
summarise(mean_val_year = mean(sales),
dev_std_year = sd(sales))
sales_per_year
## # A tibble: 5 × 3
## year mean_val_year dev_std_year
## <int> <dbl> <dbl>
## 1 2010 169. 60.5
## 2 2011 164. 63.9
## 3 2012 186. 70.9
## 4 2013 212. 84.0
## 5 2014 231. 95.5
ggplot(df, aes(x=year, y=sales)) +
geom_jitter(width=0.2, alpha=0.5) +
stat_summary(fun=mean, geom="point", color="red", size=3) +
labs(title="Volume per città con punti e media")

# Statistical analysis of 'listings' by months with graphical representation. We can see the 4° month as the one with the highest mean and the 7° with the highest SD.
listings_per_month <- df %>%
group_by(month) %>%
summarise(mean_val_month = mean(listings),
dev_std_month = sd(listings))
listings_per_month
## # A tibble: 12 × 3
## month mean_val_month dev_std_month
## <int> <dbl> <dbl>
## 1 1 1647. 705.
## 2 2 1692. 711.
## 3 3 1757. 727.
## 4 4 1826. 770.
## 5 5 1824. 790.
## 6 6 1833. 812.
## 7 7 1821. 827.
## 8 8 1786. 816.
## 9 9 1749. 803.
## 10 10 1710. 779.
## 11 11 1653. 741.
## 12 12 1558. 693.
boxplot(listings~month, col = "lightblue",
main = "Boxplot of listings over months",
xlab = "Month", ylab = "Listings")

# Boxplot to compare 'median_price' by city. Bryan-College Station has the highest median prices, while Wichita Falls has the lowest median prices.
ggplot(df, aes(x=city, y=median_price)) +
geom_boxplot() +
labs(x="City", y="Median_price",
title="Boxplot of median_price by city")

# Barplots: 'sales' by 'month' and 'sales' by 'city'. The highest amount of sales are between May and August and for Tyler, while the lowest sales are in January and for Wichita Falls. I also created a stacked barplot to see the 'sales' bu 'city' and 'month' (Tyler and Bryan-College Station have the highest values).
ggplot(df, aes(x=month, y=sales)) +
geom_col(fill="blue") +
labs(x="Month", y="Sales",
title="Barplot of sales by month") +
scale_x_continuous(
breaks = 1:12,
labels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
) +
theme_minimal()

ggplot(df, aes(x=month, y=sales, fill=city)) +
geom_bar(stat="identity") +
labs(x="Month", y="Sales",
title="Stacked barplot of sales by month and city") +
scale_x_continuous(
breaks = 1:12,
labels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
) +
theme_minimal()

ggplot(df, aes(x=city, y=sales)) +
geom_col(fill="violet") +
labs(x="City", y="Sales",
title="Barplot of sales by city") +
theme_minimal()

# Linecharts for 'sales' trend over the years by 'city'. For every city the 2014 and 2013 where the years with highest sales.
ggplot(df, aes(x = month, y = sales, color=factor(year), group=year)) +
geom_line() +
geom_point() +
facet_wrap(~ city, nrow = 2, ncol = 2) +
labs(
title = "Monthly sales trend by year",
x = "Month",
y = "Sales"
) +
scale_x_continuous(
breaks = 1:12,
labels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
) +
theme_minimal()

# Linecharts of 'volume' by 'year' (in average) for every 'city'. They all seem to have an upward trend. Tyler has the highest values, while Whicita Falls has a great drop in 2011.
df_Beaumont <- subset(df, city == "Beaumont")
df_agg_Beaumont <- aggregate(volume ~ year, data = df_Beaumont, FUN = mean)
plot(df_agg_Beaumont$year, df_agg_Beaumont$volume, type="l", col="blue", lwd=2,
main="Linechart of mean volume (mln) by year for Beaumont",
xlab="Year", ylab="Volume (mln)")

df_Bryan_College_Station <- subset(df, city == "Bryan-College Station")
df_agg_Bryan_College_Station <- aggregate(volume ~ year, data = df_Bryan_College_Station, FUN = mean)
plot(df_agg_Bryan_College_Station$year, df_agg_Bryan_College_Station$volume, type="l", col="red", lwd=2,
main="Linechart of mean volume (mln) by year for Bryan-College Station",
xlab="Year", ylab="Volume (mln)")

df_Tyler <- subset(df, city == "Tyler")
df_agg_Tyler <- aggregate(volume ~ year, data = df_Tyler, FUN = mean)
plot(df_agg_Tyler$year, df_agg_Tyler$volume, type="l", col="green", lwd=2,
main="Linechart of mean volume (mln) by year for Tyler",
xlab="Year", ylab="Volume (mln)")

df_Wichita_Falls <- subset(df, city == "Wichita Falls")
df_agg_Wichita_Falls <- aggregate(volume ~ year, data = df_Wichita_Falls, FUN = mean)
plot(df_agg_Wichita_Falls$year, df_agg_Wichita_Falls$volume, type="l", col="orange", lwd=2,
main="Linechart of mean volume (mln) by year for Wichita Falls",
xlab="Year", ylab="Volume (mln)")

————————–
CONCLUSIONS
————————–
From the historical trends,
volumes of sales (in mln) increased from 2010
to 2014 for every city, with Tyler having the highest values
and Wichita Falls the lowest values.
We can also highlight a fall in volumes for
Wichita Falls in 2011.
If we compare monthly every year for every city,
quantity of sales have the highest values in
2013 and 2014, with a peak in the central
months (May-Aug) for Bryan-College Station.
The efficacy of listings (that become sales) is
between 5% and 20% for almost every observation, with
few outliers.
We can highlight the efficacy of
39% for Bryan-College Station in July
2014. That is the maximum value
registered.
The highest classes of median price registered were
127k-138k and 148k-159k. These amounts
are nearer to the highest value of median price than to the lowest.