# Load the raw inputs: metro-area GDP, the FIPS dictionary and city
# coordinates. TRUE/FALSE are spelled out because T and F are ordinary
# variables that can be reassigned.
options(stringsAsFactors = FALSE)
gdp <- read.csv("data/gdp-metro/gdp-metro.csv", header = TRUE)
fips1 <- read.csv("data/gdp-metro/fips-dictionary.csv", header = TRUE)
# Keep the first 364 rows only; row 364 (Yuma, AZ) is the last metro area
# shown below. NOTE(review): presumably the rows after it are not dictionary
# entries -- confirm against the csv.
fips1 <- fips1[1:364, ]
latitude <- read.csv("data/city-location/latlng.csv", header = TRUE)
# Cleaning the fips dictionary: we want to separate the city and the state
# and then remove the (MSA) suffix from the column.
head(fips1)
## fips Metropolitan.Area
## 1 998 U.S. Metropolitan Portion
## 2 10180 Abilene, TX (MSA)
## 3 10420 Akron, OH (MSA)
## 4 10500 Albany, GA (MSA)
## 5 10580 Albany-Schenectady-Troy, NY (MSA)
## 6 10740 Albuquerque, NM (MSA)
tail(fips1)
## fips Metropolitan.Area
## 359 49340 Worcester, MA (MSA)
## 360 49420 Yakima, WA (MSA)
## 361 49620 York-Hanover, PA (MSA)
## 362 49660 Youngstown-Warren-Boardman, OH-PA (MSA)
## 363 49700 Yuba City, CA (MSA)
## 364 49740 Yuma, AZ (MSA)
library(reshape2)
# Build the trimmed dictionary (msa_fips, city, state) from the raw one.
# "Albany-Schenectady-Troy, NY (MSA)" is split on ", " into the metro name
# ("Albany-Schenectady-Troy") and the remainder ("NY (MSA)").
split_fips <- colsplit(fips1$Metropolitan.Area, pattern = ", ",
  names = c("msa", "state"))
fips <- data.frame(
  msa_fips = fips1$fips,
  # city = the part of the metro name before the first hyphen. sub() is
  # vectorised, so no row loop is needed and no multi-element strsplit()
  # result is squeezed into a single cell (which used to raise a warning
  # for every hyphenated name).
  city = sub("-.*", "", as.character(split_fips$msa)),
  # state = first two characters, which drops " (MSA)" and any second state
  # such as the "-PA" in "OH-PA (MSA)"
  state = substr(split_fips$state, 1, 2),
  stringsAsFactors = FALSE
)
# row 1 is the "U.S. Metropolitan Portion" aggregate, not a metro area
fips <- fips[-1, ]
# row.names is spelled out in full instead of relying on partial matching
write.table(fips, "trimmed-fips-dictionary.csv", sep = ",", row.names = FALSE)
head(fips)
## msa_fips city state
## 2 10180 Abilene TX
## 3 10420 Akron OH
## 4 10500 Albany GA
## 5 10580 Albany NY
## 6 10740 Albuquerque NM
## 7 10780 Alexandria LA
tail(fips)
## msa_fips city state
## 359 49340 Worcester MA
## 360 49420 Yakima WA
## 361 49620 York PA
## 362 49660 Youngstown OH
## 363 49700 Yuba City CA
## 364 49740 Yuma AZ
# calculating role of tourism and growth in economy
library(plyr)
# Pull out the GDP figures used for the tourism and growth analysis from one
# slice of the GDP table (called once per fips/year group).
#
# Args:
#   df: data frame with an `indust` column (industry code) and a `gdp`
#       column.
#
# Returns:
#   A vector of length six named total, leisure, accomodations,
#   entertainment, real_estate and construction, holding the `gdp` value of
#   the row carrying the corresponding industry code. A code that does not
#   occur in `df` gives NA instead of being dropped, so every call returns
#   the same shape and the per-group results can be stacked. If a code
#   occurs more than once, the first occurrence is used.
gdpanalysis <- function(df) {
  industry_codes <- c(
    total = 1,
    # tourism economy is a combination of amusement, accomodation and leisure
    leisure = 105,
    accomodations = 74,
    entertainment = 71, # amusement
    # growth is a combination of construction and real estate
    real_estate = 55,
    construction = 11
  )
  # match() gives the first row holding each code, or NA when the code is
  # absent; indexing with NA then yields an NA value for that entry.
  rows <- match(industry_codes, df$indust)
  values <- df$gdp[rows]
  names(values) <- names(industry_codes)
  values
}
# analyzing the gdp by location for each year: one row per fips/year with the
# columns returned by gdpanalysis()
new_gdp <- ddply(gdp, .(fips, year), gdpanalysis)
# adding city and state information to new_gdp; the dictionary gets a numeric
# `fips` column so both tables share a key of that name
fips$fips <- as.numeric(fips$msa_fips)
# all.x = TRUE keeps every row of new_gdp even without a dictionary entry
gdp_w_cs <- merge(new_gdp, fips, by = "fips", all.x = TRUE)
# adding latitude and longitude information to gdp; rows without a matching
# city/state keep NA coordinates
gdp_final <- merge(gdp_w_cs, latitude, by = c("city", "state"), all.x = TRUE)
# row.names is spelled out in full instead of relying on partial matching
write.table(gdp_final, "tourism-and-growth.csv", sep = ",", row.names = FALSE)
# The plots are:
library(ggplot2)
# can also sum these and find out which state has good contribution to
# tourism. tourism=sum(leisure,accomodation,entertainment)
states <- map_data("state")
# Leisure in 2008: point colour = leisure share of total GDP, point size =
# leisure dollars.
# fill is a fixed colour, so it is set outside aes(); inside aes() the string
# "grey" is treated as a data value, which gives a default palette colour and
# a spurious legend.
p <- ggplot() +
  geom_polygon(
    data = states, aes(x = long, y = lat, group = group),
    fill = "grey", colour = "white"
  )
# aes() uses bare column names so the values come from the 2008 subset passed
# as `data`; gdp_final$... would bypass the subset and use every year.
p <- p + geom_point(
  data = gdp_final[gdp_final$year == 2008, ],
  aes(x = longitude, y = latitude, colour = leisure / total, size = leisure)
)
p + scale_colour_gradient("percent of GDP", trans = "sqrt") +
  scale_size("total dollars", trans = "log10")
# Accomodations in 2008: point colour = accomodations share of total GDP,
# point size = accomodations dollars.
# fill is a fixed colour, so it is set outside aes() rather than mapped.
q <- ggplot() +
  geom_polygon(
    data = states, aes(x = long, y = lat, group = group),
    fill = "grey", colour = "white"
  )
# bare column names make aes() read from the 2008 subset passed as `data`
# instead of the full gdp_final
q <- q + geom_point(
  data = gdp_final[gdp_final$year == 2008, ],
  aes(
    x = longitude, y = latitude,
    colour = accomodations / total, size = accomodations
  )
)
q + scale_colour_gradient("percent of GDP", trans = "log") +
  scale_size("total dollars", trans = "log10")
# Arts, entertainment and recreation in 2008: point colour = share of total
# GDP, point size = dollars.
# The title is added with ggtitle(); `main` is not how ggplot() takes a
# title. The title year now matches the 2008 filter below (it used to say
# 2004). NOTE(review): if 2004 was the intended year, change the filter and
# the title together.
r <- ggplot() +
  geom_polygon(
    data = states, aes(x = long, y = lat, group = group),
    fill = "grey", colour = "white"
  ) +
  ggtitle("Cities by amount of arts, entertainment, and recreation dollars spent in 2008")
# bare column names make aes() read from the 2008 subset passed as `data`
# instead of the full gdp_final
r <- r + geom_point(
  data = gdp_final[gdp_final$year == 2008, ],
  aes(
    x = longitude, y = latitude,
    colour = entertainment / total, size = entertainment
  )
)
r + scale_colour_gradient("percent of GDP", trans = "log") +
  scale_size("total dollars", trans = "log10")
# growth
# Real estate in 2004: point colour = real estate share of total GDP, point
# size = real estate dollars.
# fill is a fixed colour, so it is set outside aes() rather than mapped.
s <- ggplot() +
  geom_polygon(
    data = states, aes(x = long, y = lat, group = group),
    fill = "grey", colour = "white"
  ) +
  # the title is a plot label, not an aesthetic, so it moves out of aes()
  ggtitle("Cities by amount of Real Estate dollars spent 2004")
# bare column names make aes() read from the 2004 subset passed as `data`
# instead of the full gdp_final
s <- s + geom_point(
  data = gdp_final[gdp_final$year == 2004, ],
  aes(
    x = longitude, y = latitude,
    colour = real_estate / total, size = real_estate
  )
)
s + scale_colour_gradient("percent of GDP") +
  scale_size("total dollars", trans = "log10")
# Construction in 2004: point colour = construction share of total GDP, point
# size = construction dollars.
# NOTE: the variable name `t` is kept as in the original script; calls such
# as t(x) still reach base::t() because R skips non-function bindings when
# looking up a function.
# fill is a fixed colour, so it is set outside aes() rather than mapped.
t <- ggplot() +
  geom_polygon(
    data = states, aes(x = long, y = lat, group = group),
    fill = "grey", colour = "white"
  ) +
  # the title is a plot label, not an aesthetic, so it moves out of aes()
  ggtitle("Cities by amount of construction dollars spent 2004")
# bare column names make aes() read from the 2004 subset passed as `data`
# instead of the full gdp_final
t <- t + geom_point(
  data = gdp_final[gdp_final$year == 2004, ],
  aes(
    x = longitude, y = latitude,
    colour = construction / total, size = construction
  )
)
t + scale_colour_gradient("percent of GDP") +
  scale_size("total dollars", trans = "log10")