getwd()
## [1] "C:/Users/PC/Desktop/sara/master/Statistica descrittiva"
setwd("C:/Users/PC/Desktop")
dati = read.csv('realestate_texas.csv', sep=',')
dim(dati)
## [1] 240 8
N=dim(dati)[1]
attach(dati)
the dataset is composed of 240 objects and 8 variables
city: nominal qualitative variable;
year: ordinal qualitative variable;
month: nominal qualitative variable (cyclic);
sales: discrete quantitative variable;
volume: continuous quantitative variable;
median_price: continuous quantitative variable;
listings: discrete quantitative variable;
months_inventory: continuous quantitative variable;
moda = table(city)
kable(moda)
| city | Freq |
|---|---|
| Beaumont | 60 |
| Bryan-College Station | 60 |
| Tyler | 60 |
| Wichita Falls | 60 |
city has a four-modal distribution.
Frequency distribution: year and month:
freq_ass_y = table(year) # Absolute frequency: 48 for each year.
freq_rel_y = table(year)/N # Relative frequency: 0.2 (20%) for each year.
distr_freq_year = cbind(freq_ass_y,freq_rel_y)
kable(distr_freq_year)
| freq_ass_y | freq_rel_y | |
|---|---|---|
| 2010 | 48 | 0.2 |
| 2011 | 48 | 0.2 |
| 2012 | 48 | 0.2 |
| 2013 | 48 | 0.2 |
| 2014 | 48 | 0.2 |
freq_ass_m = table(month)
freq_rel_m = table(month)/N
distr_freq_month = round(cbind(freq_ass_m,freq_rel_m),2)
kable(distr_freq_month)
| freq_ass_m | freq_rel_m |
|---|---|
| 20 | 0.08 |
| 20 | 0.08 |
| 20 | 0.08 |
| 20 | 0.08 |
| 20 | 0.08 |
| 20 | 0.08 |
| 20 | 0.08 |
| 20 | 0.08 |
| 20 | 0.08 |
| 20 | 0.08 |
| 20 | 0.08 |
| 20 | 0.08 |
Calculating indices: I create a function for quantitative variables
index_calculation = function(x) {
c(mean = mean(x),
median = median(x),
devst = sd(x),
min = min(x),
max = max(x),
Q1 = quantile(x, 0.25),
Q3 = quantile(x, 0.75),
IQR = IQR(x),
kurtosis = kurtosis(x),
skewness = skewness(x)
)
}
I apply the function to the variables and create a dataset ‘stat_df’ to see the statistics.
var_quant = c('sales', 'volume', 'median_price', 'listings', 'months_inventory')
stat = sapply(dati[var_quant], index_calculation)
stat_df = as.data.frame(round(stat,2))
kable(stat_df)
| sales | volume | median_price | listings | months_inventory | |
|---|---|---|---|---|---|
| mean | 192.29 | 31.01 | 132665.42 | 1738.02 | 9.19 |
| median | 175.50 | 27.06 | 134500.00 | 1618.50 | 8.95 |
| devst | 79.65 | 16.65 | 22662.15 | 752.71 | 2.30 |
| min | 79.00 | 8.17 | 73800.00 | 743.00 | 3.40 |
| max | 423.00 | 83.55 | 180000.00 | 3296.00 | 14.90 |
| Q1.25% | 127.00 | 17.66 | 117300.00 | 1026.50 | 7.80 |
| Q3.75% | 247.00 | 40.89 | 150050.00 | 2056.00 | 10.95 |
| IQR | 120.00 | 23.23 | 32750.00 | 1029.50 | 3.15 |
| kurtosis | 2.69 | 3.18 | 2.38 | 2.21 | 2.83 |
| skewness | 0.72 | 0.88 | -0.36 | 0.65 | 0.04 |
Coefficient of variation (CV)
cv = round(sapply(dati[var_quant],
function(x){return(sd(x)/mean(x)*100)}),2)
kable(cv)
| x | |
|---|---|
| sales | 41.42 |
| volume | 53.71 |
| median_price | 17.08 |
| listings | 43.31 |
| months_inventory | 25.06 |
Volume has a higher coefficient of variation than the other variables, with a percentage of 53.71%. Volume has a negative and leptokurtic skewed distribution and a greater asymmetry than the other variables (skewness=88%)
dati$sales_cl = cut(dati$sales,#I used the cut function to split the sales data into specific classes
breaks = c(79,150,250,350,423),
include.lowest = TRUE, #to include the lower limit of the first class.
right = FALSE) # to make the interval closed on the left and open on the right.
Calculating frequencies:
ni = table(dati$sales_cl)
fi = round((ni/N),2)
Ni = cumsum(ni)
Fi = round((Ni/N),2)
cbind(ni,fi,Ni,Fi)
## ni fi Ni Fi
## [79,150) 89 0.37 89 0.37
## [150,250) 91 0.38 180 0.75
## [250,350) 48 0.20 228 0.95
## [350,423] 12 0.05 240 1.00
I combined these results into a dataframe that includes all the calculated frequencies.
distr_freq_sales_cl = as.data.frame((cbind(ni,fi,Ni,Fi)))
kable(distr_freq_sales_cl)
| ni | fi | Ni | Fi | |
|---|---|---|---|---|
| [79,150) | 89 | 0.37 | 89 | 0.37 |
| [150,250) | 91 | 0.38 | 180 | 0.75 |
| [250,350) | 48 | 0.20 | 228 | 0.95 |
| [350,423] | 12 | 0.05 | 240 | 1.00 |
Gini index for distr_freq_sales_cl
gini.index = function(x){
ni = table(x)
fi = ni/length(x)
fi2 = fi^2
J= length(table(x))
gini = 1-sum(fi2)
gini.norm = gini/((J-1)/J)
return(gini.norm)
}
round(gini.index(distr_freq_sales_cl),2)
## [1] 0.75
G’= 0.75 indicates a rather unequal distribution between classes
a bar chart showing the distribution of absolute frequencies for each sales class.
barplot(ni,
main = 'frequency distribution',
xlab = 'length classes',
ylab = 'absolute frequencies',
col = 'lightblue')
Sales are mainly concentrated in the first two classes: [79,150) and [150,250)
Gini index for the city variable
gini.index(city)
## [1] 1
G’ = 1 Maximum heterogeneity. This happens because the distribution of cities is equally divided (each has a relative frequency of 0.25)
probability_city = sum(dati$city == 'Beaumont')/nrow(dati)
kable(data.frame("Probability city:" = probability_city))
| Probability.city. |
|---|
| 0.25 |
The probability that the city is “Beaumont” is 25%.
probability_month = round(sum(month == '7')/nrow(dati),3)
kable(data.frame("Probability month:" = probability_month))
| Probability.month. |
|---|
| 0.083 |
The probability that the month is “7” is 8.3%.
probability_date = round(sum(dati$month == '12' & dati$year == '2012')/nrow(dati),3)
kable(data.frame("Probability date:" = probability_date))
| Probability.date. |
|---|
| 0.017 |
The probability that the month is “12” and the year is “2012” is 0.0167 (1.7%).
I created a dataframe called ‘prob’ with the events and probabilities:
prob = data.frame(
Evento = c("Beaumont", "Luglio", "Dicembre 2012"),
Probabilità = c(probability_city, probability_month, probability_date)
)
I created a new column that calculates the mean price of properties.
mean_price_c = mutate(
dati,
mean_price = round((volume/sales),2)) %>%
select(volume,sales,mean_price) %>% head(5)
kable(mean_price_c)
| volume | sales | mean_price |
|---|---|---|
| 14.162 | 83 | 0.17 |
| 17.690 | 108 | 0.16 |
| 28.701 | 182 | 0.16 |
| 26.819 | 200 | 0.13 |
| 28.833 | 202 | 0.14 |
I created a column that measures the efficiency listings of sales ads.
efficiency_listings_c = mutate(
dati,
efficiency_listings = round((sales/listings),2)) %>%
select(,sales,listings, efficiency_listings)%>% head(5)
kable(efficiency_listings_c)
| sales | listings | efficiency_listings |
|---|---|---|
| 83 | 1533 | 0.05 |
| 108 | 1586 | 0.07 |
| 182 | 1689 | 0.11 |
| 200 | 1708 | 0.12 |
| 202 | 1771 | 0.11 |
stat_by_year =dati %>%
group_by(year) %>%
summarise(
mean = round((mean(listings, na.rm = TRUE)),2), # Mean sales
sd = round((sd(listings, na.rm = TRUE)),2), # Standard deviation of sales.
min = min(listings), # Min value of sales.
max = max(listings) # Max value of sales.
)
kable(stat_by_year)
| year | mean | sd | min | max |
|---|---|---|---|---|
| 2010 | 1826.00 | 785.02 | 904 | 3296 |
| 2011 | 1849.65 | 780.38 | 844 | 3266 |
| 2012 | 1776.81 | 738.45 | 801 | 3072 |
| 2013 | 1677.60 | 743.52 | 743 | 2998 |
| 2014 | 1560.04 | 706.71 | 746 | 2875 |
Statistics by year, month and city
stat_tot = dati %>%
group_by(city, month, year) %>%
summarise(
mean_sales = mean(sales, na.rm = TRUE),
.groups = 'drop'
)
VIEWS WITH GGPLOT2
ggplot(dati, aes(x=city,y=median_price))+
geom_boxplot(fill="lightblue")+
labs(
title = 'Distribution of average house price among cities',
x = 'city',
y= 'median price'
) +
theme_light() +
theme(
plot.title = element_text(hjust = 0.5, face='bold'),
axis.title.x = element_text(face = 'italic'),
axis.title.y = element_text(face = 'italic')
)
The boxplot graph shows the median price on the y-axis and the cities on the x-axis. Points outside the boxplot are outliers, which are found in all cities except Tyler. Also, Wichita Falls has a lower median price than other cities.
ggplot(dati, aes(x=factor(year),y=volume, fill = city))+
geom_boxplot()+
labs(
title = 'distribution of the total value of sales between the various cities and between the various years',
x = 'year',
y= 'total sales',
fill = 'city'
) +
theme_light() +
theme(
plot.title = element_text(hjust = 0.5,face='bold'),
axis.title.x = element_text(face = 'italic'),
axis.title.y = element_text(face = 'italic')
)
The graph shows the total sales over the years in the 4 cities. For each year, indicated on the x-axis, the total value of sales for each city is represented by a boxplot of a different color. The highest sales seem to occur in the cities of Bryan-College Station and Tyler, especially in the years 2013-2014. For the city of Beaumont there are outliers in 2012.
ggplot(dati)+
geom_bar(aes(x=factor(month),y=volume, fill = city), stat = "identity")+
labs(
title = 'distribution of the total value of sales',
x = 'month',
y = 'total sales',
fill = 'city'
)+
theme_light() +
theme(
plot.title = element_text(hjust = 0.5,face='bold'),
axis.title.x = element_text(face = 'italic'),
axis.title.y = element_text(face = 'italic')
)
The graph represents the total sales for each city over the months. The months are shown on the x-axis, while the total sales are shown on the y-axis. The different colors represent the 4 cities. From the graph thus obtained we can conclude that greater sales accumulate in the summer months (5,6,7,8), especially for the cities of Bryan College station and Tyler
ggplot(dati)+
geom_bar(aes(x=factor(month),y=volume, fill = city),
stat = "identity", position = 'fill')+
labs(
title = 'normalized distribution of the total value of sales',
x = 'month',
y = 'total sales'
)+
theme_light() +
theme(
plot.title = element_text(hjust = 0.5,face='bold'),
axis.title.x = element_text(face = 'italic'),
axis.title.y = element_text(face = 'italic')
)
The normalized graph represents total sales for each city over the months.
date = as.Date(paste(dati$year, dati$month, '01', sep = '-'))
ggplot(dati, aes(x=date, y=sales, color=city))+
geom_line(linewidth=1)+
geom_point(size=2)+
scale_color_discrete(name = "city") +
labs(
title = 'time series of the total value of sales',
x ='date',
y = 'total sales'
)+
scale_x_date(date_breaks = '2 months',
date_labels ='%b',
expand= c(0.02, 0.02)
)+
theme_light()+
theme(
plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = 'bottom',
panel.grid.minor = element_blank(),
strip.background = element_rect(fill = 'gray90'),
strip.text = element_text(face = 'bold')
) +
facet_wrap(~year, ncol = 3, scales = 'free_x')
The line charts show the total sales of the 4 cities over the years (2010-2014). The x-axis shows the months, while the y-axis shows the total sales for each city. Looking at the chart, we can conclude that the city of Tyler, compared to the other cities, has had the most sales since 2010 and continues to grow until 2014. On the contrary, the city with the least sales is Wichita Falls, where total sales remain constant from 2010 to 2014.