Import, Read, and Preview Data

library("readxl")
library("tidyverse")
## ── Attaching packages ─────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.3
## ✓ tidyr   1.0.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
buildings <- read_excel('tallestbuildings.xlsx')
buildings
## # A tibble: 100 x 9
##     Rank `Building Name` City  `Height (m)` `Height (ft)` Floors Completed
##    <dbl> <chr>           <chr>        <dbl>         <dbl>  <dbl>     <dbl>
##  1     1 Burj Khalifa    Duba…         828           2717    163      2010
##  2     2 Shanghai Tower  Shan…         632           2073    128      2015
##  3     3 Makkah Royal C… Mecc…         601           1972    120      2012
##  4     4 One World Trad… New …         541.          1776     94      2014
##  5     5 TAIPEI 101      Taip…         508           1667    101      2004
##  6     6 Shanghai World… Shan…         492           1614    101      2008
##  7     7 International … Hong…         484           1588    108      2010
##  8     8 Petronas Twin … Kual…         452.          1483     88      1998
##  9     8 Petronas Twin … Kual…         452.          1483     88      1998
## 10    10 Zifeng Tower    Nanj…         450           1476     66      2010
## # … with 90 more rows, and 2 more variables: Material <chr>, Use <chr>

Rename columns and clean up data

library(rattle) # need to install the package first
## Rattle: A free graphical interface for data science with R.
## Version 5.3.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(magrittr) # need to install that package first
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
names(buildings) %<>% normVarNames()
buildings
## # A tibble: 100 x 9
##     rank building_name  city  height_m height_ft floors completed material use  
##    <dbl> <chr>          <chr>    <dbl>     <dbl>  <dbl>     <dbl> <chr>    <chr>
##  1     1 Burj Khalifa   Duba…     828       2717    163      2010 steel/c… offi…
##  2     2 Shanghai Tower Shan…     632       2073    128      2015 composi… hote…
##  3     3 Makkah Royal … Mecc…     601       1972    120      2012 steel/c… othe…
##  4     4 One World Tra… New …     541.      1776     94      2014 composi… offi…
##  5     5 TAIPEI 101     Taip…     508       1667    101      2004 composi… offi…
##  6     6 Shanghai Worl… Shan…     492       1614    101      2008 composi… hote…
##  7     7 International… Hong…     484       1588    108      2010 composi… hote…
##  8     8 Petronas Twin… Kual…     452.      1483     88      1998 composi… offi…
##  9     8 Petronas Twin… Kual…     452.      1483     88      1998 composi… offi…
## 10    10 Zifeng Tower   Nanj…     450       1476     66      2010 composi… hote…
## # … with 90 more rows

Group the buildings by City in descending order

Citycounts=buildings%>%
  group_by(city)%>%
  summarise(number=length(city))
arrange(Citycounts, desc(number))
## # A tibble: 38 x 2
##    city               number
##    <chr>               <int>
##  1 Dubai (AE)             18
##  2 New York City (US)      7
##  3 Chicago (US)            6
##  4 Guangzhou (CN)          6
##  5 Hong Kong (CN)          6
##  6 Abu Dhabi (AE)          4
##  7 Moscow (RU)             4
##  8 Shanghai (CN)           4
##  9 Shenzhen (CN)           4
## 10 Kuala Lumpur (MY)       3
## # … with 28 more rows

Group and rank means of city buildings and plot

Citycounts=buildings%>%
  group_by(city)%>%
  summarise(mean_height=mean(height_ft))
mean_buildings_sort <- arrange(Citycounts, desc(mean_height))

ggplot(data = mean_buildings_sort, mapping = aes(reorder(city, mean_height), mean_height)) + geom_bar(stat = 'identity') + coord_flip() + ggtitle('Mean Height of Tallest Buildings by City') + labs(y = 'Mean of Building Heights (ft)', x = 'City')

Create new columns by separating the city and country

library(countrycode)
buildings <- buildings %>%
  separate(city, into = c('city', 'country'), sep = '[:punct:]')
## Warning: Expected 2 pieces. Additional pieces discarded in 100 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

Group the buildings by Country in descending order

# use the country code function
buildings$country = countrycode(buildings$country, "iso2c", "country.name")
Countrycounts=buildings%>%
  group_by(country)%>%
  summarise(number=length(country))
arrange(Countrycounts, desc(number))
## # A tibble: 13 x 2
##    country              number
##    <chr>                 <int>
##  1 China                    42
##  2 United Arab Emirates     22
##  3 United States            17
##  4 Russia                    4
##  5 Malaysia                  3
##  6 Saudi Arabia              3
##  7 Taiwan                    2
##  8 Thailand                  2
##  9 Australia                 1
## 10 Kuwait                    1
## 11 South Korea               1
## 12 United Kingdom            1
## 13 Vietnam                   1

Group and rank means of country buildings and plot

Countrycounts=buildings%>%
  group_by(country)%>%
  summarise(mean_height=mean(height_ft))
mean_buildings_sort <- arrange(Countrycounts, desc(mean_height))

ggplot(data = mean_buildings_sort, mapping = aes(reorder(country, mean_height), mean_height)) + geom_bar(stat = 'identity') + coord_flip() + ggtitle('Mean Height of Tallest Buildings by Country') + labs(y = 'Mean of Building Heights (ft)', x = 'Country')

Color each bar for a country based on the number of buildings from this dataset that are present in that country

Countrycounts=buildings%>%
  group_by(country)%>%
  summarise(mean_height=mean(height_ft), count=length(country))
Countrycounts
## # A tibble: 13 x 3
##    country              mean_height count
##    <chr>                      <dbl> <int>
##  1 Australia                  1058      1
##  2 China                      1167.    42
##  3 Kuwait                     1354      1
##  4 Malaysia                   1328.     3
##  5 Russia                     1069.     4
##  6 Saudi Arabia               1325.     3
##  7 South Korea                1001      1
##  8 Taiwan                     1404.     2
##  9 Thailand                   1014      2
## 10 United Arab Emirates       1193.    22
## 11 United Kingdom             1004      1
## 12 United States              1168.    17
## 13 Vietnam                    1078      1
ggplot(data = Countrycounts, mapping = aes(reorder(country, mean_height), mean_height, fill = as.factor(count))) + geom_bar(stat = 'identity') + coord_flip() + ggtitle('Mean Height of Tallest Buildings by Country') + labs(y = 'Mean of Building Heights (ft)', x = 'Country') + guides(fill=guide_legend(title="Building Count"))

Split use of buildings, pivot longer, and comparing mean heights for building purposes

buildings <- buildings %>%
  separate(use, into = c('use1', 'use2', 'use3'), sep = ' / ')
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 89 rows [2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, ...].
BuildingPurpose=buildings%>%
  # Extra non-essential step to visualize df progression and split
  group_by(building_name)%>%
  # Extra non-essential step to visualize df progression and split
  summarise(use1, use2, use3, height_ft)%>%
  pivot_longer(cols = c('use1', 'use2', 'use3'), values_to = "use", values_drop_na = TRUE)

BuildingPurpose%>%
  group_by(use)%>%
  summarise(mean_height=mean(height_ft))
## # A tibble: 6 x 2
##   use                 mean_height
##   <chr>                     <dbl>
## 1 hotel                     1210.
## 2 office                    1191.
## 3 other                     1972 
## 4 residential               1150.
## 5 retail                    1117 
## 6 serviced apartments       1118