DS Labs

Data wrangling

As you can see not all formats of our variables are adjusted. We need to prepare the appropriate formats of our variables according to their measurement scales and future usage.

mieszkania$district<-as.factor(mieszkania$district)
mieszkania$building_type<-as.factor(mieszkania$building_type)
mieszkania$rooms<-factor(mieszkania$rooms,ordered=TRUE)
attach(mieszkania)
mieszkania$price_PLN<-as.numeric(mieszkania$price_PLN)
mieszkania$price_EUR<-as.numeric(mieszkania$price_EUR)

attach(mieszkania)

## The following objects are masked from mieszkania (pos = 3):
## 
##     building_type, district, price_EUR, price_PLN, rooms, size

Frequency table and TAI

In the first stage of our analysis we are going to group our data in the form of the simple frequency table.

I’ve decided to present the analyse by the distribution of apartaments’ size.

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

## $`x:`
##                 x label Freq Percent Valid Percent Cumulative Percent
##    Valid 15-20 m2         31    15.5          15.5               15.5
##          20-25 m2         13     6.5           6.5               22.0
##          25-30 m2          3     1.5           1.5               23.5
##          30-35 m2         14     7.0           7.0               30.5
##          35-40 m2         16     8.0           8.0               38.5
##          40-45 m2         28    14.0          14.0               52.5
##          45-50 m2          8     4.0           4.0               56.5
##          50-55 m2         18     9.0           9.0               65.5
##          55-60 m2         12     6.0           6.0               71.5
##          60-65 m2         20    10.0          10.0               81.5
##          65-70 m2         12     6.0           6.0               87.5
##          70-75 m2          7     3.5           3.5               91.0
##          75-80 m2          4     2.0           2.0               93.0
##          80-85 m2          9     4.5           4.5               97.5
##          85-90 m2          5     2.5           2.5              100.0
##             Total        200   100.0         100.0                   
##  Missing  <blank>          0     0.0                                 
##              <NA>          0     0.0                                 
##             Total        200   100.0

# TAI measure
tab1<-classIntervals(mieszkania$size,n=15,style="fixed",fixedBreaks=seq(15,90,by=5))
TAI = jenks.tests(tab1)["Tabular accuracy"]
print(TAI)

## Tabular accuracy 
##        0.9383746

TAI measure is greater than 0.93 so it’s big enough to accept the proposed design of the frequency table.

Basic plots

In this section we should present our data using basic (pre-installed with R) graphics. Choose the most appropriate plots according to the scale of chosen variables. Investigate the heterogeneity of the distribution presenting data by groups (i.e. by district, building type etc.). Do not forget about main titles, labels and legend. Read more about graphical parameters here.

hist(mieszkania$size, angle = 15, col = c("#eeeaaa",  "#dddbbb"), border = "black", main = "Histogram of apartaments' size", xlab = "apartament size [m2]")

plot(sort(filter(mieszkania, rooms == 4)$size), type="o", col="#b8a712", xlim=c(1,60) , ylim=c(0, max(sort(filter(mieszkania, rooms == 4)$size))), main = "Plot of apartaments' size by number of rooms", xlab = "number of apartaments", ylab = "size of apartament")
abline(h = min(sort(filter(mieszkania, rooms == 4)$size)), col="#b8a712", lwd=1, lty=2)
abline(h = max(sort(filter(mieszkania, rooms == 4)$size)), col="#b8a712", lwd=1, lty=2)
lines(sort(filter(mieszkania, rooms == 3)$size), type="o", col="dark green", pch=22, lty=2)
abline(h = min(sort(filter(mieszkania, rooms == 3)$size)), col="dark green", lwd=1, lty=2)
abline(h = max(sort(filter(mieszkania, rooms == 3)$size)), col="dark green", lwd=1, lty=2)
lines(sort(filter(mieszkania, rooms == 2)$size), type="o", col="red", pch=22, lty=2)
abline(h = min(sort(filter(mieszkania, rooms == 2)$size)), col="red", lwd=1, lty=2)
abline(h = max(sort(filter(mieszkania, rooms == 2)$size)), col="red", lwd=1, lty=2)
lines(sort(filter(mieszkania, rooms == 1)$size), type="o", col="blue", pch=22, lty=2)
abline(h = min(sort(filter(mieszkania, rooms == 1)$size)), col="blue", lwd=1, lty=2)
abline(h = max(sort(filter(mieszkania, rooms == 1)$size)), col="blue", lwd=1, lty=2)
legend(51, 23, legend=c("4 rooms", "3 rooms", "2 rooms", "1 room"), col=c("#b8a712", "dark green", "red", "blue"), lty=1, cex=0.8)

First plot (histogram) shows us that amount of apartaments with different size is not equal. 40-50 [m2] size apartments are the most numerous, however 15-20 [m2] and 60-70 [m2] are also popular. Apartaments above 70 [m2] are less common and also apartments in size between 20 and 30 [m2] are not popular. The reason of it can explain second plot. The 2-rooms apartments starts by ~30 [m2] and 1-room apartments are in maximum size around ~22 [m2]. It means that there is a gap between ~22-30 [m2] because apartaments with 2-rooms in that size would be very small and inconvenient and 1-room apartment with size over 22 [m2] would be too spacious so people prefer buy a bit bigger (in size) 2-rooms apartment rather than 1-room on size ~20-25 [m2].

n = length(mieszkania$size)
apartments = list()
apartments$with1room = length(filter(mieszkania, rooms == 1)$size)
apartments$with2room = length(filter(mieszkania, rooms == 2)$size)
apartments$with3room = length(filter(mieszkania, rooms == 3)$size)
apartments$with4room = length(filter(mieszkania, rooms == 4)$size)

x = c(apartments$with1room, apartments$with2room, apartments$with3room, apartments$with4room)
colors = c("#999999", "#bbbbbb", "#dddddd", "#ffffff")

percents = c(sprintf("%g%s",100 * apartments$with1room/n, "%"),
             sprintf("%g%s",100 * apartments$with2room/n, "%"),
             sprintf("%g%s",100 * apartments$with3room/n, "%"),
             sprintf("%g%s",100 * apartments$with4room/n, "%"))
labels = c("1 room", "2 room", "3 room", "4 room")

pie(c(apartments$with1room, apartments$with2room, apartments$with3room, apartments$with4room), labels = percents, cex = 1, col = colors, main = "Distribution of number of rooms")
legend("topright", labels, cex = 1, fill = colors)

And there is pie chart with old-stype black-white colors. I will present nicer one in next section with ggplot2 From this we can perceive that most common are 3-room apartments, however the difference between other apartments is not big.

ggplot2 plots

In this section we will present the same plots but with the use of ggplot2 and ggpubr packages.

text <- paste("Apartments in Wroclaw.",
              "Random sample of 200 apartments.",
               sep = " ")
text.p <- ggparagraph(text = text, face = "italic", size = 11, color = "black")
density.p <- ggdensity(mieszkania, x = "size", 
          fill = "rooms", palette = c("#333aaa", "orange", "green", "violet"), title = "Size [m2] density by number of rooms", ylab = FALSE, ggtheme=theme_bw(), linetype = "solid", add="mean", rug = TRUE,combine = TRUE, merge = FALSE)
ggarrange(density.p, text.p, 
          ncol = 1, nrow = 2,
          heights = c(1, 0.2))

text <- paste("Apartments in Wroclaw.",
              "Random sample of 200 apartments.",
               sep = " ")
text.p <- ggparagraph(text = text, face = "italic", size = 11, color = "black")
density.p <- ggdensity(mieszkania, x = "price_PLN", 
          fill = "rooms", palette = c("#333aaa", "orange", "green", "violet"), title = "Price of apartment [PLN] density by number of rooms", ylab = FALSE, ggtheme=theme_bw(), linetype = "solid", add="mean", rug = TRUE,combine = TRUE, merge = FALSE)
ggarrange(density.p, text.p, 
          ncol = 1, nrow = 2,
          heights = c(1, 0.2))

#max(filter(mieszkania, rooms == 4)["price_PLN"])
#max(filter(mieszkania, rooms == 4)["price_PLN"])

As we can see, 4-room apartments are usually the most expensive. However, the 3-room apartments are sometimes more expensive, than 4-room ones. There are even some cases, where 2-room apartments are in greater price, than 4-room apartments. It can be conlcuded that size and number of rooms are important attribute of apartment but not only and for example smaller apartments in better place, newer building etc. may cost more than bigger but older/in the worse district etc.

table = data.frame(type = c("single-room", "2 room", "3 room", "4 room"),
  amount = c(apartments$with1room, apartments$with2room, apartments$with3room, apartments$with4room),
  share = c(apartments$with1room/n, apartments$with2room/n, apartments$with3room/n, apartments$with4room/n),
  min_price = c(min(filter(mieszkania, rooms == 1)$price_PLN), min(filter(mieszkania, rooms == 2)$price_PLN), min(filter(mieszkania, rooms == 3)$price_PLN), min(filter(mieszkania, rooms == 4)$price_PLN)),
  max_price = c(max(filter(mieszkania, rooms == 1)$price_PLN), max(filter(mieszkania, rooms == 2)$price_PLN), max(filter(mieszkania, rooms == 3)$price_PLN), max(filter(mieszkania, rooms == 4)$price_PLN))
)


table.p = ggtexttable(table, rows = NULL,  theme = ttheme("lVioletWhite"))
text.p <- ggparagraph(text = "Prices in PLN (Polish zloty)", face = "italic", size = 11, color = "black")

ggarrange(table.p, text.p,  ncol = 1, nrow = 2, heights = c(0.1, 0.1))

^ The previous descipted situation presented also as a nice table with ggtexttable function.

Ggplot2 allows to show the average value of each group using the stat_summary() function. No more need to calculate your mean values before plotting!

In this nice plot, we can see the distribution of sizes by number of rooms and also by a district - in one plot. We can also notice here a blue dot, that represents a mean of size in particular district.

Using facets

As we can see on plot above, thanks to faucet we can clearly see data. Apartments in “kamienica” are usually in bigger size, than in “wiezowiec”.

theme_set(theme_bw())  

# Diverging Barcharts
ggplot(mieszkania, aes(x=`rooms`, y=size, label=rooms)) + 
  geom_bar(stat='identity', aes(color=size), width=.5)  +
  scale_fill_manual(name="size", 
                    labels = c("Above Average", "Below Average"), 
                    values = c("above"="#00ba38", "below"="#f8766d")) + 
  labs(subtitle="'", 
       title= "Sum of area of all apartments by number of rooms") + 
  coord_flip()

#sum(filter(mieszkania, rooms == 1)["size"])

As we can see, number of apartments with 4-rooms is smaller than others but the area is grather so they have the biggest share in this plot

Descriptive statistics #1

Before automatically reporting the full summary table of descriptive statistics, this time your goal is to measure the central tendency of the distribution of prices. Compare mean, median and mode together with positional measures - quantiles - by districts and building types or no. of rooms per apartment.

mean(price_PLN)
median(price_PLN)
sd(price_PLN) #standard deviation
var(price_PLN) #variance
coeff_var<-sd(price_PLN)/mean(price_PLN) #coefficient of variability %
coeff_var
IQR(price_PLN)# difference between quartiles =Q3-Q1 
sx<-IQR(price_PLN)/2  #interquartile deviation
coeff_varx<-sx/median(price_PLN) #IQR coefficient of variability %
coeff_varx
min(price_PLN)
max(price_PLN)
quantile(price_PLN,probs=c(0,0.1,0.25,0.5,0.75,0.95,1),na.rm=TRUE)

measure <- list()
measure$mean = mean(mieszkania$price_PLN)
measure$median = median(mieszkania$price_PLN)
measure$sd = sd(mieszkania$price_PLN)
measure$variance = var(mieszkania$price_PLN)
measure$coeff_var = sd(mieszkania$price_PLN)/mean(mieszkania$price_PLN)
measure$sx = IQR(mieszkania$price_PLN)/2
measure$coeff_varx = measure$sx/median(mieszkania$price_PLN)
measure$min = min(mieszkania$price_PLN)
measure$max = max(mieszkania$price_PLN)
measure$quentile = quantile(mieszkania$price_PLN, probs=c(0,0.1,0.25,0.5,0.75,0.95,1), na.rm=TRUE)

stable <- desc_statby(mieszkania, measure.var = "price_PLN", grps = "district")
stable <- stable[, c("district", "length", "mean", "sd")]
# Summary table plot, medium orange theme
stable.p <- ggtexttable(stable, rows = NULL, 
                        theme = ttheme("mGreen"))
xz = data.frame(
  measure = c("mean", "median", "standard deviation", "variance", "coefficient of variability", "interquartile deviation", "IQR coefficient of variability", "min", "max", "quantile (100%)"),
  value = c(measure$mean, measure$median, measure$sd, measure$variance, measure$coeff_var, measure$sx, measure$coeff_varx, measure$min, measure$max, measure$quentile["100%"]),
  "calculated by function" = c("mean(mieszkania$price_PLN)", "median(mieszkania$price_PLN)", "sd(mieszkania$price_PLN)", "var(mieszkania$price_PLN)", "sd(mieszkania$price_PLN)/mean(mieszkania$price_PLN)", "IQR(mieszkania$price_PLN)/2", "sx/median(price_PLN)", "min(mieszkania$price_PLN", "max(mieszkania$price_PLN)", "quantile(mieszkania$price_PLN)['100%']"))
xz.p = ggtexttable(xz, rows = NULL,  theme = ttheme("lCyanWhite"))
xz.p

Summary tables with ‘kable’

Using kable and kableextra packages we can easily create summary tables with graphics and/or statistics.

mieszkania_list <- split(mieszkania$size, mieszkania$rooms)
min_price = c(min(filter(mieszkania, rooms == 1)$price_PLN), min(filter(mieszkania, rooms == 2)$price_PLN), min(filter(mieszkania, rooms == 3)$price_PLN), min(filter(mieszkania, rooms == 4)$price_PLN))
max_price = c(max(filter(mieszkania, rooms == 1)$price_PLN), max(filter(mieszkania, rooms == 2)$price_PLN), max(filter(mieszkania, rooms == 3)$price_PLN), max(filter(mieszkania, rooms == 4)$price_PLN))
numbers_of_apartments = c(apartments$with1room, apartments$with2room, apartments$with3room, apartments$with4room)
inline_plot <- data.frame(rooms = c(1, 2, 3, 4), boxplot = "", histogram = "",
                          line1 = "", points1 = "", "amount" = numbers_of_apartments, "max_price" = max_price, "min_price" = min_price)
inline_plot %>%
  kbl(booktabs = TRUE, centering = FALSE, caption = "Size of apartments by number of rooms") %>%
  kable_material(full_width = TRUE) %>%
  row_spec(c(1,3), bold = F, color = "black", background = "#cccccc") %>%
  row_spec(c(2,4), bold = F, color = "black", background = "#dddbbb") %>%
  column_spec(c(6), bold = F, color = "black", background = spec_color(numbers_of_apartments, alpha = 0.85, begin = 0.8, end=0.65, na_color = "#222222")) %>%
  column_spec(2, image = spec_boxplot(mieszkania_list, col = "#222222")) %>%
  column_spec(3, image = spec_hist(mieszkania_list, col = "#222222")) %>%
  column_spec(4, image = spec_plot(mieszkania_list, same_lim = TRUE, col = "#222222")) %>%
  column_spec(5, image = spec_plot(mieszkania_list, type = "p", col = "#222222"))

Size of apartments by number of rooms
rooms	amount	max_price	min_price
1	44	657146	359769
2	50	888634	590286
3	58	965829	632770
4	48	1277691	736669

library(psych)

## 
## Attaching package: 'psych'

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

## The following object is masked from 'package:car':
## 
##     logit

## The following object is masked from 'package:qwraps2':
## 
##     logit

raport <-
  list("Price in PLN" =
       list("Min"       = ~ min(price_PLN),
            "Max"       = ~ max(price_PLN),
            "Q1"        = ~ quantile(price_PLN,0.25),
            "Median" = ~ round(median(price_PLN),2),
            "Q3"        = ~ quantile(price_PLN,0.75),
            "Mean" = ~ round(mean(price_PLN),2),
            "Sd" = ~ round(sd(price_PLN),2),
             "IQR" = ~ round(iqr(price_PLN),2),
            "Sx" = ~ round(iqr(price_PLN)/2,2),
            "Var %" = ~ round((sd(price_PLN)/mean(price_PLN)),2),
            "IQR Var %" = ~ round((iqr(price_PLN)/median(price_PLN)),2),
            "Skewness" = ~  round(skew(price_PLN),2),
             "Kurtosis" = ~  round(kurtosis(price_PLN),2)
            ))
tabela<-summary_table(mieszkania, summaries = raport, by = c("rooms"))

kbl(tabela,
  digits = 2,
  caption="Table 1. Apartments in Wroclaw - prices in PLN.",
  col.names = c('1 room', '2 rooms', '3 rooms', '4 rooms'))%>%
 kable_material(full_width = T)%>%
 kable_styling(bootstrap_options = c("striped", "hover"))

Table 1. Apartments in Wroclaw - prices in PLN.
	1 room	2 rooms	3 rooms	4 rooms
Min	359769.00	590286.00	632770.00	736669.00
Max	657146.00	888634.00	965829.00	1277691.00
Q1	479684.75	634757.25	769683.75	909371.50
Median	520507.00	677260.00	846303.50	964338.50
Q3	555024.75	717728.50	901078.75	1050976.75
Mean	515518.05	683567.70	833706.02	974809.96
Sd	66951.03	65072.66	86943.90	113819.21
IQR	75340.00	82971.25	131395.00	141605.25
Sx	37670.00	41485.62	65697.50	70802.62
Var %	0.13	0.10	0.10	0.12
IQR Var %	0.14	0.12	0.16	0.15
Skewness	-0.20	0.80	-0.42	0.33
Kurtosis	-0.38	0.48	-0.83	0.05