Data retrieved from http://skyscrapercenter.com/buildings on 9/26/2015
Make country into different column - separate from city
suppressMessages(library(ggplot2))
suppressMessages(library(tidyr))
suppressMessages(library(dplyr))
suppressMessages(library(countrycode))
suppressMessages(library(printr))
bldg=read.csv("tallestbuildings.csv")
head(bldg)
1 |
Burj Khalifa |
Dubai (AE) |
828.0 |
2,717 |
163 |
2010 |
steel/concrete |
office / residential / hotel |
2 |
Makkah Royal Clock Tower Hotel |
Mecca (SA) |
601.0 |
1,972 |
120 |
2012 |
steel/concrete |
other / hotel |
3 |
One World Trade Center |
New York City (US) |
541.3 |
1,776 |
94 |
2014 |
composite |
office |
4 |
TAIPEI 101 |
Taipei (TW) |
508.0 |
1,667 |
101 |
2004 |
composite |
office |
5 |
Shanghai World Financial Center |
Shanghai (CN) |
492.0 |
1,614 |
101 |
2008 |
composite |
hotel / office |
6 |
International Commerce Centre |
Hong Kong (CN) |
484.0 |
1,588 |
108 |
2010 |
composite |
hotel / office |
str(bldg)
## 'data.frame': 100 obs. of 9 variables:
## $ Rank : int 1 2 3 4 5 6 7 7 9 10 ...
## $ Building.Name: Factor w/ 100 levels "23 Marina","4 World Trade Center",..: 15 53 65 79 73 40 68 69 100 96 ...
## $ City : Factor w/ 40 levels "Abu Dhabi (AE)",..: 9 23 28 35 32 13 19 19 27 6 ...
## $ HeightMeter : num 828 601 541 508 492 ...
## $ HeightFeet : Factor w/ 77 levels "1,001","1,002",..: 65 64 63 62 61 60 59 59 58 57 ...
## $ Floors : int 163 120 94 101 101 108 88 88 66 108 ...
## $ Completed : int 2010 2012 2014 2004 2008 2010 1998 1998 2010 1974 ...
## $ Material : Factor w/ 4 levels "composite","concrete",..: 4 4 1 1 1 1 1 1 1 3 ...
## $ Use : Factor w/ 13 levels "hotel","hotel / office",..: 7 8 6 6 2 2 6 6 2 6 ...
# Make HeightFeet numeric
bldg$HeightFeet=as.character(bldg$HeightFeet)
bldg$HeightFeet=gsub(",","",bldg$HeightFeet)
bldg$HeightFeet=as.numeric(bldg$HeightFeet)
# same as: bldg$HeightFeet=as.numeric(gsub(",","",as.character(bldg$HeightFeet)))
# http://astrostatistics.psu.edu/su07/R/html/base/html/regex.html
newbldg=bldg %>% separate(City, c("City","Country"),sep="[:punct:]")
# Number of buildings in different countries
table(newbldg$Country)
21 |
2 |
1 |
1 |
36 |
1 |
1 |
2 |
2 |
4 |
1 |
4 |
3 |
1 |
2 |
17 |
1 |
# change country codes to country names
newbldg$Country=countrycode(newbldg$Country,"iso2c","country.name")
#table(newbldg$Country)
## dplyr used here
newbldg %>% group_by(Country)%>%
summarise(Count=length(Country))%>%
ggplot(aes(x= reorder(Country,Count),y=Count)) +
geom_bar(stat="identity") +xlab("")+coord_flip() +geom_text(aes(Country,Count+1, label=Count))

newbldg %>% group_by(Country)%>%
summarise(Mean.Height=round(mean(HeightFeet),2))%>%
ggplot(aes(x= reorder(Country,Mean.Height),y=Mean.Height)) +
geom_bar(stat="identity") +xlab("")+coord_flip() +geom_text(aes(Country,Mean.Height-100, label=Mean.Height),color="white")

newbldg %>% group_by(Country)%>%
summarise(Mean.Height=round(mean(HeightFeet),2),
Count=length(Country))%>%
ggplot(aes(x= reorder(Country,Mean.Height),y=Mean.Height,fill=as.factor(Count))) +
geom_bar(stat="identity") +xlab("")+coord_flip() +geom_text(aes(Country,Mean.Height+75, label=Mean.Height))+ scale_fill_discrete(name="Number of \nBuildings")

newbldg %>% group_by(City)%>%
summarise(Count=length(City))%>%
ggplot(aes(x= reorder(City,Count),y=Count)) +
geom_bar(stat="identity") +xlab("")+coord_flip() +scale_y_continuous(breaks=c(1,2,3,4,6,7,18))

newbldg %>% group_by(City)%>%
summarise(Mean.Height=round(mean(HeightFeet),2))%>%
ggplot(aes(x= reorder(City,Mean.Height),y=Mean.Height)) +
geom_point() +xlab("")+coord_flip()

utilbldg=newbldg %>% separate(Use, c("Use1","Use2","Use3"),sep="/")%>%
gather(numuses,Use,Use1:Use3)%>%
filter(Use!="")
head(utilbldg)
1 |
Burj Khalifa |
Dubai |
United Arab Emirates |
828.0 |
2717 |
163 |
2010 |
steel/concrete |
Use1 |
office |
2 |
Makkah Royal Clock Tower Hotel |
Mecca |
Saudi Arabia |
601.0 |
1972 |
120 |
2012 |
steel/concrete |
Use1 |
other |
3 |
One World Trade Center |
New York City |
United States |
541.3 |
1776 |
94 |
2014 |
composite |
Use1 |
office |
4 |
TAIPEI 101 |
Taipei |
Taiwan, Province of China |
508.0 |
1667 |
101 |
2004 |
composite |
Use1 |
office |
5 |
Shanghai World Financial Center |
Shanghai |
China |
492.0 |
1614 |
101 |
2008 |
composite |
Use1 |
hotel |
6 |
International Commerce Centre |
Hong Kong |
China |
484.0 |
1588 |
108 |
2010 |
composite |
Use1 |
hotel |
#http://stackoverflow.com/questions/2261079/how-to-trim-leading-and-trailing-whitespace-in-r
trim <- function (x) gsub("^\\s+|\\s+$", "", x)
utilbldg$Use <- trim(utilbldg$Use)
utilbldg%>%group_by(Use)%>% summarise(MeanHeight=mean(HeightFeet))
hotel |
1190.100 |
office |
1166.222 |
other |
1972.000 |
residential |
1133.387 |
retail |
1072.667 |
serviced apartments |
1118.000 |
utilbldg %>% group_by(Use)%>%summarise(Count=length(Use))
hotel |
40 |
office |
72 |
other |
1 |
residential |
31 |
retail |
3 |
serviced apartments |
2 |
utilbldg%>%group_by(Use)%>% summarise(MeanHeight=mean(HeightFeet),Count=length(Use))%>%
ggplot(.,aes(x=Use,y=MeanHeight,fill=as.factor(Count)))+geom_bar(stat="identity")+geom_text(aes(Use,MeanHeight+40, label=round(MeanHeight,2)))+ scale_fill_discrete(name="Number of \nBuildings")+ylab("Mean Height, in feet")
