Import, Read, and Preview Data
library("readxl")
library("tidyverse")
## ── Attaching packages ─────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.3
## ✓ tidyr 1.0.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ── Conflicts ────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
buildings <- read_excel('tallestbuildings.xlsx')
buildings
## # A tibble: 100 x 9
## Rank `Building Name` City `Height (m)` `Height (ft)` Floors Completed
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 Burj Khalifa Duba… 828 2717 163 2010
## 2 2 Shanghai Tower Shan… 632 2073 128 2015
## 3 3 Makkah Royal C… Mecc… 601 1972 120 2012
## 4 4 One World Trad… New … 541. 1776 94 2014
## 5 5 TAIPEI 101 Taip… 508 1667 101 2004
## 6 6 Shanghai World… Shan… 492 1614 101 2008
## 7 7 International … Hong… 484 1588 108 2010
## 8 8 Petronas Twin … Kual… 452. 1483 88 1998
## 9 8 Petronas Twin … Kual… 452. 1483 88 1998
## 10 10 Zifeng Tower Nanj… 450 1476 66 2010
## # … with 90 more rows, and 2 more variables: Material <chr>, Use <chr>
Rename columns and clean up data
library(rattle) # need to install the package first
## Rattle: A free graphical interface for data science with R.
## Version 5.3.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(magrittr) # need to install that package first
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
names(buildings) %<>% normVarNames()
buildings
## # A tibble: 100 x 9
## rank building_name city height_m height_ft floors completed material use
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 1 Burj Khalifa Duba… 828 2717 163 2010 steel/c… offi…
## 2 2 Shanghai Tower Shan… 632 2073 128 2015 composi… hote…
## 3 3 Makkah Royal … Mecc… 601 1972 120 2012 steel/c… othe…
## 4 4 One World Tra… New … 541. 1776 94 2014 composi… offi…
## 5 5 TAIPEI 101 Taip… 508 1667 101 2004 composi… offi…
## 6 6 Shanghai Worl… Shan… 492 1614 101 2008 composi… hote…
## 7 7 International… Hong… 484 1588 108 2010 composi… hote…
## 8 8 Petronas Twin… Kual… 452. 1483 88 1998 composi… offi…
## 9 8 Petronas Twin… Kual… 452. 1483 88 1998 composi… offi…
## 10 10 Zifeng Tower Nanj… 450 1476 66 2010 composi… hote…
## # … with 90 more rows
Group the buildings by City in descending order
Citycounts=buildings%>%
group_by(city)%>%
summarise(number=length(city))
arrange(Citycounts, desc(number))
## # A tibble: 38 x 2
## city number
## <chr> <int>
## 1 Dubai (AE) 18
## 2 New York City (US) 7
## 3 Chicago (US) 6
## 4 Guangzhou (CN) 6
## 5 Hong Kong (CN) 6
## 6 Abu Dhabi (AE) 4
## 7 Moscow (RU) 4
## 8 Shanghai (CN) 4
## 9 Shenzhen (CN) 4
## 10 Kuala Lumpur (MY) 3
## # … with 28 more rows
Group and rank means of city buildings and plot
Citycounts=buildings%>%
group_by(city)%>%
summarise(mean_height=mean(height_ft))
mean_buildings_sort <- arrange(Citycounts, desc(mean_height))
ggplot(data = mean_buildings_sort, mapping = aes(reorder(city, mean_height), mean_height)) + geom_bar(stat = 'identity') + coord_flip() + ggtitle('Mean Height of Tallest Buildings by City') + labs(y = 'Mean of Building Heights (ft)', x = 'City')

Create new columns by separating the city and country
library(countrycode)
buildings <- buildings %>%
separate(city, into = c('city', 'country'), sep = '[:punct:]')
## Warning: Expected 2 pieces. Additional pieces discarded in 100 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
Group the buildings by Country in descending order
# use the country code function
buildings$country = countrycode(buildings$country, "iso2c", "country.name")
Countrycounts=buildings%>%
group_by(country)%>%
summarise(number=length(country))
arrange(Countrycounts, desc(number))
## # A tibble: 13 x 2
## country number
## <chr> <int>
## 1 China 42
## 2 United Arab Emirates 22
## 3 United States 17
## 4 Russia 4
## 5 Malaysia 3
## 6 Saudi Arabia 3
## 7 Taiwan 2
## 8 Thailand 2
## 9 Australia 1
## 10 Kuwait 1
## 11 South Korea 1
## 12 United Kingdom 1
## 13 Vietnam 1
Group and rank means of country buildings and plot
Countrycounts=buildings%>%
group_by(country)%>%
summarise(mean_height=mean(height_ft))
mean_buildings_sort <- arrange(Countrycounts, desc(mean_height))
ggplot(data = mean_buildings_sort, mapping = aes(reorder(country, mean_height), mean_height)) + geom_bar(stat = 'identity') + coord_flip() + ggtitle('Mean Height of Tallest Buildings by Country') + labs(y = 'Mean of Building Heights (ft)', x = 'Country')

Color each bar for a country based on the number of buildings from this dataset that are present in that country
Countrycounts=buildings%>%
group_by(country)%>%
summarise(mean_height=mean(height_ft), count=length(country))
Countrycounts
## # A tibble: 13 x 3
## country mean_height count
## <chr> <dbl> <int>
## 1 Australia 1058 1
## 2 China 1167. 42
## 3 Kuwait 1354 1
## 4 Malaysia 1328. 3
## 5 Russia 1069. 4
## 6 Saudi Arabia 1325. 3
## 7 South Korea 1001 1
## 8 Taiwan 1404. 2
## 9 Thailand 1014 2
## 10 United Arab Emirates 1193. 22
## 11 United Kingdom 1004 1
## 12 United States 1168. 17
## 13 Vietnam 1078 1
ggplot(data = Countrycounts, mapping = aes(reorder(country, mean_height), mean_height, fill = as.factor(count))) + geom_bar(stat = 'identity') + coord_flip() + ggtitle('Mean Height of Tallest Buildings by Country') + labs(y = 'Mean of Building Heights (ft)', x = 'Country') + guides(fill=guide_legend(title="Building Count"))

Split use of buildings, pivot longer, and comparing mean heights for building purposes
buildings <- buildings %>%
separate(use, into = c('use1', 'use2', 'use3'), sep = ' / ')
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 89 rows [2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, ...].
BuildingPurpose=buildings%>%
# Extra non-essential step to visualize df progression and split
group_by(building_name)%>%
# Extra non-essential step to visualize df progression and split
summarise(use1, use2, use3, height_ft)%>%
pivot_longer(cols = c('use1', 'use2', 'use3'), values_to = "use", values_drop_na = TRUE)
BuildingPurpose%>%
group_by(use)%>%
summarise(mean_height=mean(height_ft))
## # A tibble: 6 x 2
## use mean_height
## <chr> <dbl>
## 1 hotel 1210.
## 2 office 1191.
## 3 other 1972
## 4 residential 1150.
## 5 retail 1117
## 6 serviced apartments 1118