DS Lab #1

Karol Flisikowski – student: s188026 Yifei Liu

2021-03-24

Data wrangling

As you can see not all formats of our variables are adjusted. We need to prepare the appropriate formats of our variables according to their measurement scales and future usage.

mieszkania$district<-as.factor(mieszkania$district)
mieszkania$building_type<-as.factor(mieszkania$building_type)
mieszkania$rooms<-factor(mieszkania$rooms,ordered=TRUE)
attach(mieszkania)
mieszkania$price_PLN<-as.numeric(mieszkania$price_PLN)
mieszkania$price_EUR<-as.numeric(mieszkania$price_EUR)

Frequency table

#PLN
Range1<-cut(price_PLN,seq(350000,1350000,by=100000))
table1<-table(Range1)
transform(table1,Prop_Freq=prop.table(Freq),Cum_Freq=cumsum(Freq))
##                 Range1 Freq Prop_Freq Cum_Freq
## 1    (3.5e+05,4.5e+05]    9     0.045        9
## 2    (4.5e+05,5.5e+05]   21     0.105       30
## 3    (5.5e+05,6.5e+05]   33     0.165       63
## 4    (6.5e+05,7.5e+05]   36     0.180       99
## 5    (7.5e+05,8.5e+05]   31     0.155      130
## 6    (8.5e+05,9.5e+05]   36     0.180      166
## 7   (9.5e+05,1.05e+06]   21     0.105      187
## 8  (1.05e+06,1.15e+06]   10     0.050      197
## 9  (1.15e+06,1.25e+06]    2     0.010      199
## 10 (1.25e+06,1.35e+06]    1     0.005      200
#EUR
Range2<-cut(price_EUR,seq(80000,320000,by=40000))
table2<-table(Range2)
transform(table2,Prop_Freq=prop.table(Freq),Cum_Freq=cumsum(Freq))
##              Range2 Freq Prop_Freq Cum_Freq
## 1   (8e+04,1.2e+05]   20     0.100       20
## 2 (1.2e+05,1.6e+05]   59     0.295       79
## 3   (1.6e+05,2e+05]   54     0.270      133
## 4   (2e+05,2.4e+05]   52     0.260      185
## 5 (2.4e+05,2.8e+05]   13     0.065      198
## 6 (2.8e+05,3.2e+05]    2     0.010      200

TAI

##        # classes  Goodness of fit Tabular accuracy 
##       10.0000000        0.9780872        0.8508467
##        # classes  Goodness of fit Tabular accuracy 
##        6.0000000        0.9348720        0.7429525

Basic plots

In this section we should present our data using basic (pre-installed with R) graphics. Choose the most appropriate plots according to the scale of chosen variables. Investigate the heterogeneity of the distribution presenting data by groups (i.e. by district, building type etc.). Do not forget about main titles, labels and legend. Read more about graphical parameters here.

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

ggplot2 plots

Using facets

Faceting generates small multiples each showing a different subset of the data. Small multiples are a powerful tool for exploratory data analysis: you can rapidly compare patterns in different parts of the data and see whether they are the same or different. Read more about facets here.

Descriptive statistics #1

Before automatically reporting the full summary table of descriptive statistics, this time your goal is to measure the central tendency of the distribution of prices. Compare mean, median and mode together with positional measures - quantiles - by districts and building types or no. of rooms per apartment.

## [1] "PLN"
## [1] 760035
## [1] 755719.5
## [1] 186099.8
## [1] 34633125960
##      75% 
## 282686.5
## [1] 359769
## [1] 1277691
##        0%        5%       25%       50%       75%       95%      100% 
##  359769.0  477175.4  619073.8  755719.5  901760.2 1054250.8 1277691.0
## [1] "EUR"
## [1] 175934
## [1] 174935
## [1] 43078.62
## [1] 1855767906
##      75% 
## 65436.25
## [1] 83280
## [1] 295762
##       0%       5%      25%      50%      75%      95%     100% 
##  83280.0 110457.8 143304.2 174935.0 208740.5 244039.9 295762.0

Summary tables with ‘kable’

Using kable and kableextra packages we can easily create summary tables with graphics and/or statistics.

#PLN
mieszkania_list <- split(mieszkania$price_PLN, mieszkania$rooms)
inline_plot <- data.frame(rooms = c(1, 2, 3, 4), boxplot = "", histogram = "",
                          line1 = "", line2 = "", points1 = "")
inline_plot %>%
  kbl(booktabs = TRUE) %>%
  kable_paper(full_width = FALSE) %>%
  column_spec(2, image = spec_boxplot(mieszkania_list)) %>%
  column_spec(3, image = spec_hist(mieszkania_list)) %>%
  column_spec(4, image = spec_plot(mieszkania_list, same_lim = TRUE)) %>%
  column_spec(5, image = spec_plot(mieszkania_list, same_lim = FALSE)) %>%
  column_spec(6, image = spec_plot(mieszkania_list, type = "p"))
rooms boxplot histogram line1 line2 points1
1
2
3
4
#EUR
mieszkania_list <- split(mieszkania$price_EUR, mieszkania$rooms)
inline_plot <- data.frame(rooms = c(1, 2, 3, 4), boxplot = "", histogram = "",
                          line1 = "", line2 = "", points1 = "")
inline_plot %>%
  kbl(booktabs = TRUE) %>%
  kable_paper(full_width = FALSE) %>%
  column_spec(2, image = spec_boxplot(mieszkania_list)) %>%
  column_spec(3, image = spec_hist(mieszkania_list)) %>%
  column_spec(4, image = spec_plot(mieszkania_list, same_lim = TRUE)) %>%
  column_spec(5, image = spec_plot(mieszkania_list, same_lim = FALSE)) %>%
  column_spec(6, image = spec_plot(mieszkania_list, type = "p"))
rooms boxplot histogram line1 line2 points1
1
2
3
4

Ok, now we will finally summarize basic central tendency measures for prices by districts/building types using kable packages. You can customize your final report. See some hints here.

library(psych)
#PLN and district
summary1 <-
  list("price PLN" =
         list(
           "price PLN_min" = ~ min(price_PLN),
           "price PLN_max" = ~ max(price_PLN),
           "price PLN_mean(sd)" = ~ qwraps2::mean_sd(price_PLN),
           "Q1" = ~ quantile(price_PLN,0.25),
           "Median" = ~ median(price_PLN),
           "Q3" = ~ quantile(price_PLN,0.75),
           "IQR" = ~ iqr(price_PLN),
           "Var %" = ~ round(sd(price_PLN)/mean(price_PLN),1),
           "IQR Var %" = ~ round(iqr(price_PLN)/median(price_PLN),1),
           "Skewness" = ~  round(skew(price_PLN),1),
           "Kurtosis" = ~  round(kurtosis(price_PLN),1)
           )
       )
summary_table(mieszkania, summary1, by=c("district"))%>%
  kbl(digits = 2,
    caption="Table 1. prices_PLN and district",
    col.names = c('Biskupin', 'Krzyki', 'Srodmiescie'))%>%
  kable_classic(full_width = F, html_font = "Cambria")%>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Table 1. prices_PLN and district
Biskupin Krzyki Srodmiescie
price PLN_min 519652 359769 448196
price PLN_max 1277691 1090444 1062054
price PLN_mean(sd) 818,614.06 \(\pm\) 175,597.94 726,507.20 \(\pm\) 195,015.45 739,339.70 \(\pm\) 171,428.11
Q1 676751 600180.5 592287.75
Median 817736 716726 727477.5
Q3 926474 876306.5 870752.5
IQR 249723 276126 278464.75
Var % 0.2 0.3 0.2
IQR Var % 0.3 0.4 0.4
Skewness 0.3 0.1 0.1
Kurtosis -0.3 -1 -1.1
#EUR and building type
summary2 <-
  list("price EUR" =
         list(
           "price EUR_min" = ~ min(price_EUR),
           "price EUR_max" = ~ max(price_EUR),
           "price EUR_mean" = ~ qwraps2::mean_sd(price_EUR),
           "Q1" = ~ quantile(price_EUR,0.25),
           "Median" = ~ round(median(price_EUR),2),
           "Q3" = ~ quantile(price_EUR,0.75),
           "IQR" = ~ iqr(price_EUR),
           "Var %" = ~ round(sd(price_EUR)/mean(price_EUR),1),
           "IQR Var %" = ~ round(iqr(price_EUR)/median(price_EUR),1),
           "Skewness" = ~  round(skew(price_EUR),1),
           "Kurtosis" = ~  round(kurtosis(price_EUR),1)
           )
       )
summary_table(mieszkania, summary2,by=c("building_type"))%>%
  kbl(digits = 2,
    caption="Table 1. prices_EUR and building type",
    col.names = c('kamienica', 'niski blok', 'wiezowiec'))%>%
  kable_classic(full_width = F, html_font = "Cambria")%>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Table 1. prices_EUR and building type
kamienica niski blok wiezowiec
price EUR_min 96258 114905 83280
price EUR_max 284918 295762 252418
price EUR_mean 178,317.70 \(\pm\) 42,682.42 188,790.90 \(\pm\) 40,831.08 163,363.18 \(\pm\) 42,246.11
Q1 149944 160399.5 128657.5
Median 185346 187013 157107.5
Q3 207450 217558.5 201563
IQR 57506 57159 72905.5
Var % 0.2 0.2 0.3
IQR Var % 0.3 0.3 0.5
Skewness 0 0.2 0.2
Kurtosis -0.6 -0.4 -1
#facet: price per room of different buildings in different district
#different room size use different colour then will find the same size may have similar price per room
plot <- ggplot(mieszkania, aes(x=price_PLN, y=price_PLN/as.numeric(rooms))) + 
  geom_point(shape=21,colour=rooms,stroke=0.25,alpha=0.8,size=as.numeric(rooms))
plot + facet_grid(district~building_type, scales = "free", space = "free")