Capstone

library(stringr)
library(dplyr)
library(reshape2)

library(XML)
library(gsubfn)
library(data.table)
library(tidyr)
library(zoo)
library(lattice)
library(caret)
library(MASS)
library(splitstackshape)
library(ggplot2)
library(plotly)
library(plyr)

I begin first by loading each of the league datasets. These datasets were manually compiled and contains yearly information (winning pct, ticket prices, attendance…) for every team in the MLB, NHL, NBA and NFL from 1991:2016. I’ll clean up some of the variables as well as add in the salary cap for each league by year. I’ll then bind the leagues together to create one large dataset.

mlb <- read.csv("mlb.csv", header = T, stringsAsFactors = F)
nhl<- read.csv("nhl.csv", header = T, stringsAsFactors = F)
nba <- read.csv("nba.csv", header = T, stringsAsFactors = F)
nfl <- read.csv("nfl.csv", header = T, stringsAsFactors = F)

as.numeric.factor <- function(x) {as.numeric(levels(x))[x]}

nfl$Payroll <- str_trim(nfl$Payroll)
nfl$Payroll <- as.numeric(nfl$Payroll)

nfl$Salary.Cap <- as.numeric(nfl$Salary.Cap)
mlb$Salary.Cap <- as.numeric(mlb$Salary.Cap)
nba$Salary.Cap <- as.numeric(nba$Salary.Cap)
nhl$Salary.Cap <- as.numeric(nhl$Salary.Cap)


nfl$Salary.Cap[nfl$Year == 2009] <- 123000000
nfl$Salary.Cap[nfl$Year == 2008] <- 116000000
nfl$Salary.Cap[nfl$Year == 2007] <-109000000
nfl$Salary.Cap[nfl$Year == 2006] <-102000000
nfl$Salary.Cap[nfl$Year == 2005] <-85500000
nfl$Salary.Cap[nfl$Year == 2004] <-80582000
nfl$Salary.Cap[nfl$Year == 2003] <-75007000
nfl$Salary.Cap[nfl$Year == 2002] <-71101000
nfl$Salary.Cap[nfl$Year == 2001] <-67405000
nfl$Salary.Cap[nfl$Year == 2000] <-62172000
nfl$Salary.Cap[nfl$Year == 2010] <- NA
nfl$Salary.Cap[nfl$Year == 2011] <- 120000000
nfl$Salary.Cap[nfl$Year == 2012] <-120600000
nfl$Salary.Cap[nfl$Year == 2013] <-123000000
nfl$Salary.Cap[nfl$Year == 2014] <-133000000
nfl$Salary.Cap[nfl$Year == 2015] <-143280000
nfl$Salary.Cap[nfl$Year == 2016] <-155270000
nfl$Salary.Cap[nfl$Year == 1994] <-34608000
nfl$Salary.Cap[nfl$Year == 1995] <-37100000
nfl$Salary.Cap[nfl$Year == 1996] <-40753000
nfl$Salary.Cap[nfl$Year == 1997] <-41454000
nfl$Salary.Cap[nfl$Year == 1998] <-52388000
nfl$Salary.Cap[nfl$Year == 1999] <-57288000

nhl$Salary.Cap[nhl$Year == 2005] <-39000000
nhl$Salary.Cap[nhl$Year == 2006] <-44000000
nhl$Salary.Cap[nhl$Year == 2007] <-50300000
nhl$Salary.Cap[nhl$Year == 2008] <-56700000
nhl$Salary.Cap[nhl$Year == 2009] <-56800000
nhl$Salary.Cap[nhl$Year == 2010] <-59400000
nhl$Salary.Cap[nhl$Year == 2011] <-64300000
nhl$Salary.Cap[nhl$Year == 2012] <-60000000
nhl$Salary.Cap[nhl$Year == 2013] <-64300000
nhl$Salary.Cap[nhl$Year == 2014] <-69000000
nhl$Salary.Cap[nhl$Year == 2015] <-71400000
nhl$Salary.Cap[nhl$Year == 2016] <-73000000

nba$Salary.Cap[nba$Year == 1991] <-12500000
nba$Salary.Cap[nba$Year == 1992] <-14000000
nba$Salary.Cap[nba$Year == 1993] <-15175000
nba$Salary.Cap[nba$Year == 1994] <-15964000
nba$Salary.Cap[nba$Year == 1995] <-23000000
nba$Salary.Cap[nba$Year == 1996] <-24363000
nba$Salary.Cap[nba$Year == 1997] <-26900000
nba$Salary.Cap[nba$Year == 1998] <-30000000
nba$Salary.Cap[nba$Year == 1999] <-34000000
nba$Salary.Cap[nba$Year == 2000] <-35500000
nba$Salary.Cap[nba$Year == 2001] <-42500000
nba$Salary.Cap[nba$Year == 2002] <-40271000
nba$Salary.Cap[nba$Year == 2003] <-43840000
nba$Salary.Cap[nba$Year == 2004] <-43870000
nba$Salary.Cap[nba$Year == 2005] <-49500000
nba$Salary.Cap[nba$Year == 2006] <-53135000
nba$Salary.Cap[nba$Year == 2007] <-55630000
nba$Salary.Cap[nba$Year == 2008] <-58680000
nba$Salary.Cap[nba$Year == 2009] <-57700000
nba$Salary.Cap[nba$Year == 2010] <-58044000
nba$Salary.Cap[nba$Year == 2011] <-58044000
nba$Salary.Cap[nba$Year == 2012] <-58044000
nba$Salary.Cap[nba$Year == 2013] <-58679000
nba$Salary.Cap[nba$Year == 2014] <-63065000
nba$Salary.Cap[nba$Year == 2015] <-70000000
nba$Salary.Cap[nba$Year == 2016] <-94143000

mlb$Salary.Cap[mlb$Year == 2003] <-117000000
mlb$Salary.Cap[mlb$Year == 2004] <-120500000
mlb$Salary.Cap[mlb$Year == 2005] <-128000000
mlb$Salary.Cap[mlb$Year == 2006] <-136500000
mlb$Salary.Cap[mlb$Year == 2007] <-148000000
mlb$Salary.Cap[mlb$Year == 2008] <-155000000
mlb$Salary.Cap[mlb$Year == 2009] <-162000000
mlb$Salary.Cap[mlb$Year == 2010] <-170000000
mlb$Salary.Cap[mlb$Year == 2011] <-178000000
mlb$Salary.Cap[mlb$Year == 2012] <-178000000
mlb$Salary.Cap[mlb$Year == 2013] <-178000000
mlb$Salary.Cap[mlb$Year == 2014] <-189000000
mlb$Salary.Cap[mlb$Year == 2015] <-189000000
mlb$Salary.Cap[mlb$Year == 2016] <-189000000

four <- rbind(mlb[c(1:20)], nfl[c(1:20)], nba[c(1:20)], nhl[c(1:20)])

Now to attach demographic information to each city.

I will begin with population data that has been interpolated in Excel for the years in between census.

pop.cities <- read.csv("city_pop.csv", header = T)
pop.cities <- pop.cities[1:3]

majorfour <- merge(four, pop.cities, all.x = T, by = c("City", "Year"))
majorfour <- majorfour[,-15]
majorfour <- mutate(majorfour, att.pct = majorfour$Ave.Attendance/majorfour$Capacity)

##A mistake (will be found later)
majorfour$T.Attendance[majorfour$Year == 2006 & majorfour$Team == "Oakland Raiders"] <- 467964
majorfour$Ave.Attendance[majorfour$Year == 2006 & majorfour$Team == "Oakland Raiders"] <- 467964/8

head(majorfour)

##      City Year Tm.ABV                          Team      State League
## 1 Anaheim 1991    LAA Los Angeles Angels of Anaheim California    MLB
## 2 Anaheim 1992    LAA Los Angeles Angels of Anaheim California    MLB
## 3 Anaheim 1993    ANA                 Anaheim Ducks California    NHL
## 4 Anaheim 1993    LAA Los Angeles Angels of Anaheim California    MLB
## 5 Anaheim 1994    ANA                 Anaheim Ducks California    NHL
## 6 Anaheim 1994    LAA Los Angeles Angels of Anaheim California    MLB
##                      Venue Capacity T.Attendance Ave.Attendance Ave.Tick
## 1 Angel Stadium of Anaheim    64593      2416236          29830     8.76
## 2 Angel Stadium of Anaheim    64593      2065444          25499     8.02
## 3             Honda Center    17200       685848          16728       NA
## 4 Angel Stadium of Anaheim    64593      2057460          25401     8.02
## 5             Honda Center    17174       704134          17174    33.22
## 6 Angel Stadium of Anaheim    64593      1512622          24010     8.06
##    Wpct  Payroll Salary.Cap Median.Income Median.Age Ave.Temp TZ   Age
## 1 0.500 33060001         NA            NA         NA       NA NA 29.35
## 2 0.444 34749334         NA            NA         NA       NA NA 28.45
## 3 0.423  8700000         NA            NA         NA       NA NA    NA
## 4 0.438 28588334         NA            NA         NA       NA NA 28.05
## 5 0.385 13200000         NA            NA         NA       NA NA    NA
## 6 0.409 25156218         NA            NA         NA       NA NA 28.35
##      Pop   att.pct
## 1 270608 0.4618147
## 2 274612 0.3947641
## 3 278740 0.9725581
## 4 278740 0.3932469
## 5 281772 1.0000000
## 6 281772 0.3717121

Now to begin loading the demographic data pulled from the US (1990, 2000, 2010, 2015) and Canadian Census (1996, 2001, 2010, 2016). I begin with the American census, creating a list of the American cities:

city.frame <- as.data.frame(table(majorfour$City))
city.frame <- city.frame[1]
names(city.frame)[1] <- "City"

citylist <- as.character(city.frame$City)
citylist <- citylist[-c(6, 15, 27, 35, 40, 51,52,54)]

citylist[14]<- "GreenBay"
citylist[19]<- "KansasCity"
citylist[20]<- "LosAngeles"
citylist[26]<- "NewOrleans"
citylist[27]<- "NewYork"
citylist[30]<- "Oklahoma"
citylist[38]<- "SaltLakeCity"
citylist[39]<- "SanAntonio"
citylist[40]<- "SanDiego"
citylist[41]<- "SanFransisco"
citylist[42]<- "SanJose"
citylist[44]<- "StLouis"
citylist[45]<- "Tampa"
citylist[46]<- "Washington"
citylist <- citylist[-15]

The demographic data has been pulled from individual files from each city/demographic info/census. I begin by loading the data and binding the cities together to create full datasets of each demographic variables from each census.

The first example is the Age Demographic information from the 2000 US Census. Here I create a function that searches from a list of all the files and then creates dataframes from those .csv files. I am then able to call these dataframes to bind the df’s into one large dataframe. A small caveat is that the 2000 Census demographic data does not contain information on Nashville, so I have had to account for this:

citylist.nonash <- citylist[-24]

filenames.Age2000 <- sprintf("Age_2000_%s.csv", citylist.nonash)

for (i in filenames.Age2000){
  name <- gsub(".csv", "", i)
  name <- gsub("Age_2000_", "", name)
  name <- paste("Age", name, sep = ".")
  all_content <- readLines(i)
  skip_second <- all_content[-1]
  assign(name, read.csv(textConnection(skip_second), header = T))
}

dfnames <- sprintf("Age.%s", citylist.nonash)

df1 <- '"Age.Anaheim"      "Age.Atlanta"      "Age.Baltimore"    "Age.Boston"       "Age.Buffalo"     
[6] "Age.Charlotte"    "Age.Chicago"      "Age.Cincinnati"   "Age.Cleveland"    "Age.Columbus"    
[11] "Age.Dallas"       "Age.Denver"       "Age.Detroit"      "Age.GreenBay"     "Age.Houston"     
[16] "Age.Indianapolis" "Age.Jacksonville" "Age.KansasCity"   "Age.LosAngeles"   "Age.Memphis"     
[21] "Age.Miami"        "Age.Milwaukee"    "Age.Minneapolis"  "Age.NewOrleans"   "Age.NewYork"     
[26] "Age.Newark"       "Age.Oakland"      "Age.Oklahoma"     "Age.Orlando"      "Age.Philadelphia"
[31] "Age.Phoenix"      "Age.Pittsburgh"   "Age.Portland"     "Age.Raleigh"      "Age.Sacramento"  
[36] "Age.SaltLakeCity" "Age.SanAntonio"   "Age.SanDiego"     "Age.SanFransisco" "Age.SanJose"     
[41] "Age.Seattle"      "Age.StLouis"      "Age.Tampa"        "Age.Washington"' 

df1 <- gsub('"', "", df1)
df1 <- gsub("\\[", "", df1)
df1 <- gsub("\\]", "", df1)
df1 <- gsub("\\d", "", df1)
df1 <- gsub("\\s+", " ", df1)
df1 <- gsub("\\s", ", ", df1)

Age2000 <- rbind.fill(list(Age.Anaheim, Age.Atlanta, Age.Baltimore, Age.Boston, Age.Buffalo, Age.Charlotte, Age.Chicago, Age.Cincinnati, Age.Cleveland, Age.Columbus, Age.Dallas, Age.Denver, Age.Detroit, Age.GreenBay, Age.Houston, Age.Indianapolis, Age.Jacksonville, Age.KansasCity, Age.LosAngeles, Age.Memphis, Age.Miami, Age.Milwaukee, Age.Minneapolis, Age.NewOrleans, Age.NewYork, Age.Newark, Age.Oakland, Age.Oklahoma, Age.Orlando, Age.Philadelphia, Age.Phoenix, Age.Pittsburgh, Age.Portland, Age.Raleigh, Age.Sacramento, Age.SaltLakeCity, Age.SanAntonio, Age.SanDiego, Age.SanFransisco, Age.SanJose, Age.Seattle, Age.StLouis, Age.Tampa, Age.Washington))

I can now repeat these steps with the rest of the demographic datasets. I will be re-using names to save some time. There is also a bit of cleaning up for each of the datasets:

###Age2010
filenames.Age2010 <- sprintf("Age_2010_%s.csv", citylist)

for (i in filenames.Age2010){
  name <- gsub(".csv", "", i)
  name <- gsub("Age_2010_", "", name)
  name <- paste("Age", name, sep = ".")
  all_content <- readLines(i)
  skip_second <- all_content[-1]
  assign(name, read.csv(textConnection(skip_second), header = T))
}

Age2010 <- rbind.fill(list(Age.Anaheim, Age.Atlanta, Age.Baltimore, Age.Boston, Age.Buffalo, Age.Charlotte, Age.Chicago, Age.Cincinnati, Age.Cleveland, Age.Columbus, Age.Dallas, Age.Denver, Age.Detroit, Age.GreenBay, Age.Houston, Age.Indianapolis, Age.Jacksonville, Age.KansasCity, Age.LosAngeles, Age.Memphis, Age.Miami, Age.Milwaukee, Age.Minneapolis, Age.Nashville, Age.NewOrleans, Age.NewYork, Age.Newark, Age.Oakland, Age.Oklahoma, Age.Orlando, Age.Philadelphia, Age.Phoenix, Age.Pittsburgh, Age.Portland, Age.Raleigh, Age.Sacramento, Age.SaltLakeCity, Age.SanAntonio, Age.SanDiego, Age.SanFransisco, Age.SanJose, Age.Seattle, Age.StLouis, Age.Tampa, Age.Washington))

Age2010[4] <- lapply(Age2010[4], function (x) gsub("\\([^\\)]+\\)", "", x))

###AGE2015
filenames.Age2015 <- sprintf("Age_2015_%s.csv", citylist)

for (i in filenames.Age2015){
  name <- gsub(".csv", "", i)
  name <- gsub("Age_2015_", "", name)
  name <- paste("Age", name, sep = ".")
  all_content <- readLines(i)
  skip_second <- all_content[-1]
  assign(name, read.csv(textConnection(skip_second), header = T))
}

Age2015 <- rbind.fill(list(Age.Anaheim, Age.Atlanta, Age.Baltimore, Age.Boston, Age.Buffalo, Age.Charlotte, Age.Chicago, Age.Cincinnati, Age.Cleveland, Age.Columbus, Age.Dallas, Age.Denver, Age.Detroit, Age.GreenBay, Age.Houston, Age.Indianapolis, Age.Jacksonville, Age.KansasCity, Age.LosAngeles, Age.Memphis, Age.Miami, Age.Milwaukee, Age.Minneapolis, Age.Nashville, Age.NewOrleans, Age.NewYork, Age.Newark, Age.Oakland, Age.Oklahoma, Age.Orlando, Age.Philadelphia, Age.Phoenix, Age.Pittsburgh, Age.Portland, Age.Raleigh, Age.Sacramento, Age.SaltLakeCity, Age.SanAntonio, Age.SanDiego, Age.SanFransisco, Age.SanJose, Age.Seattle, Age.StLouis, Age.Tampa, Age.Washington))

Age2015[5] <- lapply(Age2015[5], function (x) gsub("\\*\\*\\*\\*\\*", NA, x))

###INCOME2000
filenames.Income2000 <- sprintf("Income_2000_%s.csv", citylist.nonash)

for (i in filenames.Income2000){
  name <- gsub(".csv", "", i)
  name <- gsub("Income_2000_", "", name)
  name <- paste("Age", name, sep = ".")
  all_content <- readLines(i)
  skip_second <- all_content[-1]
  assign(name, read.csv(textConnection(skip_second), header = T))
}

Income2000 <- rbind.fill(list(Age.Anaheim, Age.Atlanta, Age.Baltimore, Age.Boston, Age.Buffalo, Age.Charlotte, Age.Chicago, Age.Cincinnati, Age.Cleveland, Age.Columbus, Age.Dallas, Age.Denver, Age.Detroit, Age.GreenBay, Age.Houston, Age.Indianapolis, Age.Jacksonville, Age.KansasCity, Age.LosAngeles, Age.Memphis, Age.Miami, Age.Milwaukee, Age.Minneapolis, Age.NewOrleans, Age.NewYork, Age.Newark, Age.Oakland, Age.Oklahoma, Age.Orlando, Age.Philadelphia, Age.Phoenix, Age.Pittsburgh, Age.Portland, Age.Raleigh, Age.Sacramento, Age.SaltLakeCity, Age.SanAntonio, Age.SanDiego, Age.SanFransisco, Age.SanJose, Age.Seattle, Age.StLouis, Age.Tampa, Age.Washington))


###INCOME2010
filenames.Income2010 <- sprintf("Income_2010_%s.csv", citylist)

for (i in filenames.Income2010){
  name <- gsub(".csv", "", i)
  name <- gsub("Income_2010_", "", name)
  name <- paste("Age", name, sep = ".")
  all_content <- readLines(i)
  skip_second <- all_content[-1]
  assign(name, read.csv(textConnection(skip_second), header = T))
}

Income2010 <- rbind.fill(list(Age.Anaheim, Age.Atlanta, Age.Baltimore, Age.Boston, Age.Buffalo, Age.Charlotte, Age.Chicago, Age.Cincinnati, Age.Cleveland, Age.Columbus, Age.Dallas, Age.Denver, Age.Detroit, Age.GreenBay, Age.Houston, Age.Indianapolis, Age.Jacksonville, Age.KansasCity, Age.LosAngeles, Age.Memphis, Age.Miami, Age.Milwaukee, Age.Minneapolis, Age.Nashville, Age.NewOrleans, Age.NewYork, Age.Newark, Age.Oakland, Age.Oklahoma, Age.Orlando, Age.Philadelphia, Age.Phoenix, Age.Pittsburgh, Age.Portland, Age.Raleigh, Age.Sacramento, Age.SaltLakeCity, Age.SanAntonio, Age.SanDiego, Age.SanFransisco, Age.SanJose, Age.Seattle, Age.StLouis, Age.Tampa, Age.Washington))

###INCOME2015
filenames.Income2015 <- sprintf("Income_2015_%s.csv", citylist)

for (i in filenames.Income2015){
  name <- gsub(".csv", "", i)
  name <- gsub("Income_2015_", "", name)
  name <- paste("Age", name, sep = ".")
  all_content <- readLines(i)
  skip_second <- all_content[-1]
  assign(name, read.csv(textConnection(skip_second), header = T))
}

Income2015 <- rbind.fill(list(Age.Anaheim, Age.Atlanta, Age.Baltimore, Age.Boston, Age.Buffalo, Age.Charlotte, Age.Chicago, Age.Cincinnati, Age.Cleveland, Age.Columbus, Age.Dallas, Age.Denver, Age.Detroit, Age.GreenBay, Age.Houston, Age.Indianapolis, Age.Jacksonville, Age.KansasCity, Age.LosAngeles, Age.Memphis, Age.Miami, Age.Milwaukee, Age.Minneapolis, Age.Nashville, Age.NewOrleans, Age.NewYork, Age.Newark, Age.Oakland, Age.Oklahoma, Age.Orlando, Age.Philadelphia, Age.Phoenix, Age.Pittsburgh, Age.Portland, Age.Raleigh, Age.Sacramento, Age.SaltLakeCity, Age.SanAntonio, Age.SanDiego, Age.SanFransisco, Age.SanJose, Age.Seattle, Age.StLouis, Age.Tampa, Age.Washington))

###INDUSTRY2015
filenames.Industry2015 <- sprintf("Industry_2015_%s.csv", citylist)

for (i in filenames.Industry2015){
  name <- gsub(".csv", "", i)
  name <- gsub("Industry_2015_", "", name)
  name <- paste("Age", name, sep = ".")
  all_content <- readLines(i)
  skip_second <- all_content[-1]
  assign(name, read.csv(textConnection(skip_second), header = T))
}

Industry2015 <- rbind.fill(list(Age.Anaheim, Age.Atlanta, Age.Baltimore, Age.Boston, Age.Buffalo, Age.Charlotte, Age.Chicago, Age.Cincinnati, Age.Cleveland, Age.Columbus, Age.Dallas, Age.Denver, Age.Detroit, Age.GreenBay, Age.Houston, Age.Indianapolis, Age.Jacksonville, Age.KansasCity, Age.LosAngeles, Age.Memphis, Age.Miami, Age.Milwaukee, Age.Minneapolis, Age.Nashville, Age.NewOrleans, Age.NewYork, Age.Newark, Age.Oakland, Age.Oklahoma, Age.Orlando, Age.Philadelphia, Age.Phoenix, Age.Pittsburgh, Age.Portland, Age.Raleigh, Age.Sacramento, Age.SaltLakeCity, Age.SanAntonio, Age.SanDiego, Age.SanFransisco, Age.SanJose, Age.Seattle, Age.StLouis, Age.Tampa, Age.Washington))

###RACE2015
filenames.Race2015 <- sprintf("Race_2015_%s.csv", citylist)

for (i in filenames.Race2015){
  name <- gsub(".csv", "", i)
  name <- gsub("Race_2015_", "", name)
  name <- paste("Age", name, sep = ".")
  all_content <- readLines(i)
  skip_second <- all_content[-1]
  assign(name, read.csv(textConnection(skip_second), header = T))
}

Race2015 <- rbind.fill(list(Age.Anaheim, Age.Atlanta, Age.Baltimore, Age.Boston, Age.Buffalo, Age.Charlotte, Age.Chicago, Age.Cincinnati, Age.Cleveland, Age.Columbus, Age.Dallas, Age.Denver, Age.Detroit, Age.GreenBay, Age.Houston, Age.Indianapolis, Age.Jacksonville, Age.KansasCity, Age.LosAngeles, Age.Memphis, Age.Miami, Age.Milwaukee, Age.Minneapolis, Age.Nashville, Age.NewOrleans, Age.NewYork, Age.Newark, Age.Oakland, Age.Oklahoma, Age.Orlando, Age.Philadelphia, Age.Phoenix, Age.Pittsburgh, Age.Portland, Age.Raleigh, Age.Sacramento, Age.SaltLakeCity, Age.SanAntonio, Age.SanDiego, Age.SanFransisco, Age.SanJose, Age.Seattle, Age.StLouis, Age.Tampa, Age.Washington))

Race2015[5] <- lapply(Race2015[5], function (x) gsub("\\*\\*\\*\\*\\*", NA, x))

###RACE2010
filenames.Race2010 <- sprintf("Race_2010_%s.csv", citylist)

for (i in filenames.Race2010){
  name <- gsub(".csv", "", i)
  name <- gsub("Race_2010_", "", name)
  name <- paste("Age", name, sep = ".")
  all_content <- readLines(i)
  skip_second <- all_content[-1]
  assign(name, read.csv(textConnection(skip_second), header = T))
}

Race2010 <- rbind.fill(list(Age.Anaheim, Age.Atlanta, Age.Baltimore, Age.Boston, Age.Buffalo, Age.Charlotte, Age.Chicago, Age.Cincinnati, Age.Cleveland, Age.Columbus, Age.Dallas, Age.Denver, Age.Detroit, Age.GreenBay, Age.Houston, Age.Indianapolis, Age.Jacksonville, Age.KansasCity, Age.LosAngeles, Age.Memphis, Age.Miami, Age.Milwaukee, Age.Minneapolis, Age.Nashville, Age.NewOrleans, Age.NewYork, Age.Newark, Age.Oakland, Age.Oklahoma, Age.Orlando, Age.Philadelphia, Age.Phoenix, Age.Pittsburgh, Age.Portland, Age.Raleigh, Age.Sacramento, Age.SaltLakeCity, Age.SanAntonio, Age.SanDiego, Age.SanFransisco, Age.SanJose, Age.Seattle, Age.StLouis, Age.Tampa, Age.Washington))

Race2010$Number..RACE...Total.population <- lapply(Race2010$Number..RACE...Total.population, function (x) gsub("\\([^\\)]+\\)", "", x))
Race2010[52] <- lapply(Race2010[52], function (x) gsub("\\([^\\)]+\\)", "", x))
Race2010[38] <- lapply(Race2010[38], function (x) gsub("\\([^\\)]+\\)", "", x))

###RACE2000
filenames.Race2000 <- sprintf("Race_2000_%s.csv", citylist.nonash)

for (i in filenames.Race2000){
  name <- gsub(".csv", "", i)
  name <- gsub("Race_2000_", "", name)
  name <- paste("Age", name, sep = ".")
  all_content <- readLines(i)
  skip_second <- all_content[-1]
  assign(name, read.csv(textConnection(skip_second), header = T))
}

Race2000 <- rbind.fill(list(Age.Anaheim, Age.Atlanta, Age.Baltimore, Age.Boston, Age.Buffalo, Age.Charlotte, Age.Chicago, Age.Cincinnati, Age.Cleveland, Age.Columbus, Age.Dallas, Age.Denver, Age.Detroit, Age.GreenBay, Age.Houston, Age.Indianapolis, Age.Jacksonville, Age.KansasCity, Age.LosAngeles, Age.Memphis, Age.Miami, Age.Milwaukee, Age.Minneapolis, Age.NewOrleans, Age.NewYork, Age.Newark, Age.Oakland, Age.Oklahoma, Age.Orlando, Age.Philadelphia, Age.Phoenix, Age.Pittsburgh, Age.Portland, Age.Raleigh, Age.Sacramento, Age.SaltLakeCity, Age.SanAntonio, Age.SanDiego, Age.SanFransisco, Age.SanJose, Age.Seattle, Age.StLouis, Age.Tampa, Age.Washington))

###Census 1990

Census_1990 <- read.csv("citylist.csv", header = T)

Next is the Canadian Census data sets:

canlist <- c("Montreal", "Ottawa", "Toronto", "Winnipeg", "Calgary", "Edmonton", "Vancouver")

#Census 1996
filenames.Census <- sprintf("Census_1996_%s.csv", canlist)

for (i in filenames.Census){
  name <- gsub(".csv", "", i)
  name <- gsub("Census_1996_", "", name)
  name <- paste("Census", name, sep = ".")
  assign(name, read.csv(i, header = F))
}

Can_Census1996 <- rbind.fill(list(Census.Calgary, Census.Edmonton, Census.Montreal, Census.Ottawa, Census.Toronto, Census.Vancouver, Census.Winnipeg)) 


###Age 2001
filenames.Census <- sprintf("Age_2001_%s.csv", canlist)

for (i in filenames.Census){
  name <- gsub(".csv", "", i)
  name <- gsub("Age_2001_", "", name)
  name <- paste("Census", name, sep = ".")
  assign(name, read.csv(i, header = T))
}

Can_Age2001 <- rbind.fill(list(Census.Calgary, Census.Edmonton, Census.Montreal, Census.Ottawa, Census.Toronto, Census.Vancouver, Census.Winnipeg)) 

###Age 2006
filenames.Census <- sprintf("Age_2006_%s.csv", canlist)

for (i in filenames.Census){
  name <- gsub(".csv", "", i)
  name <- gsub("Age_2006_", "", name)
  name <- paste("Census", name, sep = ".")
  assign(name, read.csv(i, header = T))
}

Can_Age2006 <- rbind.fill(list(Census.Calgary, Census.Edmonton, Census.Montreal, Census.Ottawa, Census.Toronto, Census.Vancouver, Census.Winnipeg)) 

###Age 2011
filenames.Census <- sprintf("Age_2011_%s.csv", canlist)

for (i in filenames.Census){
  name <- gsub(".csv", "", i)
  name <- gsub("Age_2011_", "", name)
  name <- paste("Census", name, sep = ".")
  assign(name, read.csv(i, header = T))
}

Can_Age2011 <- rbind.fill(list(Census.Calgary, Census.Edmonton, Census.Montreal, Census.Ottawa, Census.Toronto, Census.Vancouver, Census.Winnipeg)) 

###Income 2001
Can_Income2001 <- read.csv("Income_2001_Allcity.csv", header = T)

###Income 2011
filenames.Census <- sprintf("Income_2011_%s.csv", canlist)

for (i in filenames.Census){
  name <- gsub(".csv", "", i)
  name <- gsub("Income_2011_", "", name)
  name <- paste("Census", name, sep = ".")
  assign(name, read.csv(i, header = T))
}

Can_Income2011 <- rbind.fill(list(Census.Calgary, Census.Edmonton, Census.Montreal, Census.Ottawa, Census.Toronto, Census.Vancouver, Census.Winnipeg)) 

###Industry 2001
filenames.Census <- sprintf("Industry_2001_%s.csv", canlist)

for (i in filenames.Census){
  name <- gsub(".csv", "", i)
  name <- gsub("Industry_2001_", "", name)
  name <- paste("Census", name, sep = ".")
  assign(name, read.csv(i, header = T))
}

Can_Industry2001 <- rbind.fill(list(Census.Calgary, Census.Edmonton, Census.Montreal, Census.Ottawa, Census.Toronto, Census.Vancouver, Census.Winnipeg)) 

###Industry 2011
filenames.Census <- sprintf("Industry_2011_%s.csv", canlist)

for (i in filenames.Census){
  name <- gsub(".csv", "", i)
  name <- gsub("Industry_2011_", "", name)
  name <- paste("Census", name, sep = ".")
  assign(name, read.csv(i, header = T))
}

Can_Industry2011 <- rbind.fill(list(Census.Calgary, Census.Edmonton, Census.Montreal, Census.Ottawa, Census.Toronto, Census.Vancouver, Census.Winnipeg)) 

###Race 2001
filenames.Census <- sprintf("Race_2001_%s.csv", canlist)

for (i in filenames.Census){
  name <- gsub(".csv", "", i)
  name <- gsub("Race_2001_", "", name)
  name <- paste("Census", name, sep = ".")
  assign(name, read.csv(i, header = T))
}

Can_Race2001 <- rbind.fill(list(Census.Calgary, Census.Edmonton, Census.Montreal, Census.Ottawa, Census.Toronto, Census.Vancouver, Census.Winnipeg)) 

###Race 2011
filenames.Census <- sprintf("Race_2011_%s.csv", canlist)

for (i in filenames.Census){
  name <- gsub(".csv", "", i)
  name <- gsub("Race_2011_", "", name)
  name <- paste("Census", name, sep = ".")
  assign(name, read.csv(i, header = F, sep = ","))
}

Can_Race2011 <- rbind.fill(list(Census.Calgary, Census.Edmonton, Census.Montreal, Census.Ottawa, Census.Toronto, Census.Vancouver, Census.Winnipeg)) 

###Can Census 2016
Can_Census2016 <- read.csv("Census_2016.csv", header = T)

The next step is to merge all this demographic information to the year by year team information found in the majourfour dataframe. I decided to create uniform census datasets that all contained the same 62 variables. Any missing variables were given NA’s. This was however quite difficult as each of the different demographic datasets contained different variables or different ways of interpreting these variables. I worked closely with the original data to know how to transform the data into the uniform datasets I was looking for:

##Can Census 1996

Can_Census1996 <- Can_Census1996[,c(1, 3, 7, 8, 9, 10, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 1622, 473, 474, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1036)]

names(Can_Census1996)[1] <- 'city'
names(Can_Census1996)[2] <- 'Population'
Can_Census1996 <- mutate(Can_Census1996, m.per.f.100 = (Can_Census1996$V7/Can_Census1996$V31)*100)
Can_Census1996 <- mutate(Can_Census1996, age.Less5 = (Can_Census1996$V8 + Can_Census1996$V32)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.5.9 = (Can_Census1996$V9 + Can_Census1996$V33)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.10.14 = (Can_Census1996$V10 + Can_Census1996$V34)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.15.19 = (Can_Census1996$V16 + Can_Census1996$V40)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.20.24 = (Can_Census1996$V17 + Can_Census1996$V41)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.25.29 = (Can_Census1996$V18 + Can_Census1996$V42)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.30.34 = (Can_Census1996$V19 + Can_Census1996$V43)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.35.39 = (Can_Census1996$V20 + Can_Census1996$V44)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.40.44 = (Can_Census1996$V21 + Can_Census1996$V45)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.45.49 = (Can_Census1996$V22 + Can_Census1996$V46)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.50.54 = (Can_Census1996$V23 + Can_Census1996$V47)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.55.59 = (Can_Census1996$V24 + Can_Census1996$V48)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.60.64 = (Can_Census1996$V25 + Can_Census1996$V49)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.65.69 = (Can_Census1996$V26 + Can_Census1996$V50)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.70.74 = (Can_Census1996$V27 + Can_Census1996$V51)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.75.79 = (Can_Census1996$V28 + Can_Census1996$V52)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.80.84 = (Can_Census1996$V29 + Can_Census1996$V53)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.Plus85 = (Can_Census1996$V30 + Can_Census1996$V54)/Can_Census1996[,2]*100)

Can_Census1996 <- mutate(Can_Census1996, Median.Income = Can_Census1996$V1622)

Can_Census1996 <- mutate(Can_Census1996, race.white = (Can_Census1996$V797 - Can_Census1996$V474)/Can_Census1996$V473*100) 
Can_Census1996 <- mutate(Can_Census1996, race.black = Can_Census1996$V786/Can_Census1996$V473*100) 
Can_Census1996 <- mutate(Can_Census1996, race.asian = (Can_Census1996$V787 + Can_Census1996$V788 + Can_Census1996$V789 + Can_Census1996$V790 + Can_Census1996$V791 + Can_Census1996$V792)/Can_Census1996$V473*100) 
Can_Census1996 <- mutate(Can_Census1996, race.latin = Can_Census1996$V794/Can_Census1996$V473*100)  
Can_Census1996 <- mutate(Can_Census1996, race.indigenous = Can_Census1996$V474/Can_Census1996$V473*100)  
Can_Census1996 <- mutate(Can_Census1996, race.other = (Can_Census1996$V795 + Can_Census1996$V793)/Can_Census1996$V473*100) 
Can_Census1996 <- mutate(Can_Census1996, race.mixed = Can_Census1996$V796/Can_Census1996$V473*100)

Can_Census1996 <- mutate(Can_Census1996, i.agri = (Can_Census1996$V1017 + Can_Census1996$V1018 + Can_Census1996$V1019)/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.mining = Can_Census1996$V1020/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.utilities = NA)
Can_Census1996 <- mutate(Can_Census1996, i.construction = Can_Census1996$V1022/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.manufacturing = Can_Census1996$V1021/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.wholesale = Can_Census1996$V1025/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.retail = Can_Census1996$V1026/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.transportation = Can_Census1996$V1023/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.info.culture = Can_Census1996$V1024/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.finance.insur = Can_Census1996$V1027/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.realestate = Can_Census1996$V1028/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.sci.technical = NA)
Can_Census1996 <- mutate(Can_Census1996, i.company.management = Can_Census1996$V1029/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.admin.waste = NA)
Can_Census1996 <- mutate(Can_Census1996, i.education = Can_Census1996$V1031/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.health = Can_Census1996$V1032/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.arts.entertain = NA)
Can_Census1996 <- mutate(Can_Census1996, i.food.accommo = Can_Census1996$V1033/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.other = Can_Census1996$V1034/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.pub.admin = Can_Census1996$V1030/Can_Census1996$V1016*100)

Can_Census1996 <- mutate(Can_Census1996, Median.Age = NA)
Can_Census1996 <- mutate(Can_Census1996, Male.Median = NA)
Can_Census1996 <- mutate(Can_Census1996, Female.Median = NA)
Can_Census1996 <- mutate(Can_Census1996, Year = 1996)
Can_Census1996$inc.0.10 <- NA
Can_Census1996$inc.10.15 <- NA
Can_Census1996$inc.15.25 <- NA
Can_Census1996$inc.25.35 <- NA
Can_Census1996$inc.35.50 <- NA
Can_Census1996$inc.50.75 <- NA
Can_Census1996$inc.75.100 <- NA
Can_Census1996$inc.100.150 <- NA
Can_Census1996$inc.150.200 <- NA
Can_Census1996$inc.200.plus <- NA


Can_Census1996 <- Can_Census1996[,c(1,126,76,123:125,77:95,127:136,96:122)]

names(Can_Census1996)[1] <- "City"


###Can Age 2001

Can_Age2001 <- mutate(Can_Age2001, m.per.f.100 = Can_Age2001$Male.Tot/Can_Age2001$Female.Tot*100)
Can_Age2001 <- mutate(Can_Age2001, Year = 2001)
Can_Age2001 <- mutate(Can_Age2001, age.Less5 = Can_Age2001[,4]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.5.9 = Can_Age2001[,10]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.10.14 = Can_Age2001[,16]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.15.19 = Can_Age2001[,22]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.20.24 = Can_Age2001[,28]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.25.29 = Can_Age2001[,34]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.30.34 = Can_Age2001[,40]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.35.39 = Can_Age2001[,46]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.40.44 = Can_Age2001[,52]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.45.49 = Can_Age2001[,58]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.50.54 = Can_Age2001[,64]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.55.59 = Can_Age2001[,70]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.60.64 = Can_Age2001[,76]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.65.69 = Can_Age2001[,82]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.70.74 = Can_Age2001[,88]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.75.79 = Can_Age2001[,94]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.80.84 = Can_Age2001[,100]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.Plus85 = (Can_Age2001[,106] + Can_Age2001[,112] + Can_Age2001[,118] + Can_Age2001[,124])/Can_Age2001[,1]*100)

Can_Age2001 <- Can_Age2001[,c(128,130,1,129,125,126,127,131:148)]

names(Can_Age2001)[3] <- "Total.Pop"
names(Can_Age2001)[5] <- "Median.Age"
names(Can_Age2001)[6] <- "Male.Median"
names(Can_Age2001)[7] <- "Female.Median"

Can_Age2001 <- Can_Age2001[,-3]

###Can Age 2006

Can_Age2006 <- mutate(Can_Age2006, m.per.f.100 = Can_Age2006$Male.Tot/Can_Age2006$Female.Tot*100)
Can_Age2006 <- mutate(Can_Age2006, Year = 2006)
Can_Age2006 <- mutate(Can_Age2006, age.Less5 = Can_Age2006[,9]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.5.9 = Can_Age2006[,15]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.10.14 = Can_Age2006[,21]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.15.19 = Can_Age2006[,28]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.20.24 = Can_Age2006[,34]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.25.29 = Can_Age2006[,41]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.30.34 = Can_Age2006[,47]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.35.39 = Can_Age2006[,54]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.40.44 = Can_Age2006[,60]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.45.49 = Can_Age2006[,67]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.50.54 = Can_Age2006[,73]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.55.59 = Can_Age2006[,80]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.60.64 = Can_Age2006[,86]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.65.69 = Can_Age2006[,93]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.70.74 = Can_Age2006[,99]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.75.79 = Can_Age2006[,105]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.80.84 = Can_Age2006[,112]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.Plus85 = (Can_Age2006[,118] + Can_Age2006[,124] + Can_Age2006[,130] + Can_Age2006[,136])/Can_Age2006[,1]*100)

Can_Age2006 <- Can_Age2006[,c(5,138,1,137,2,3,4,139:156)]
names(Can_Age2006) <- colnames(Can_Age2001)

Can_Age2006 <- Can_Age2006[,-3]

###Can Age 2011

Can_Age2011 <- mutate(Can_Age2011, m.per.f.100 = Can_Age2011$Male.Tot/Can_Age2011$Female.Tot*100)
Can_Age2011 <- mutate(Can_Age2011, Year = 2011)
Can_Age2011 <- mutate(Can_Age2011, age.Less5 = Can_Age2011[,5]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.5.9 = Can_Age2011[,11]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.10.14 = Can_Age2011[,17]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.15.19 = Can_Age2011[,24]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.20.24 = Can_Age2011[,30]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.25.29 = Can_Age2011[,37]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.30.34 = Can_Age2011[,43]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.35.39 = Can_Age2011[,50]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.40.44 = Can_Age2011[,56]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.45.49 = Can_Age2011[,63]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.50.54 = Can_Age2011[,69]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.55.59 = Can_Age2011[,76]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.60.64 = Can_Age2011[,82]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.65.69 = Can_Age2011[,89]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.70.74 = Can_Age2011[,95]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.75.79 = Can_Age2011[,101]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.80.84 = Can_Age2011[,108]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.Plus85 = (Can_Age2011[,114] + Can_Age2011[,120] + Can_Age2011[,126] + Can_Age2011[,132])/Can_Age2011[,1]*100)

Can_Age2011 <- Can_Age2011[,c(136,138,1,137,133,134,135,139:156)]
names(Can_Age2011) <- colnames(Can_Age2001)

Can_Age2011 <- Can_Age2011[,-3]

###USA Age 2000

Age2000 <- Age2000[,c(3,4,10, 284, 285, 286, 14, 21, 28, 35, 42, 49, 56, 63, 70, 77, 84, 91, 98, 105, 112, 119, 126, 133,140)]
Age2000 <- mutate(Age2000, age.Plus85 = (Age2000[,24] + Age2000[,25]))
Age2000 <- mutate(Age2000, Year = 2000)
Age2000 <- Age2000[,c(1,27,2:23,26)]
Age2000 <- Age2000[,-3]
names(Age2000) <- colnames(Can_Age2001)



###USA Age 2010

Age2010 <- Age2010[,c(3,4,10, 284, 285, 286, 14, 21, 28, 35, 42, 49, 56, 63, 70, 77, 84, 91, 98, 105, 112, 119, 126, 133,140)]
Age2010 <- mutate(Age2010, age.Plus85 = (Age2010[,24] + Age2010[,25]))
Age2010 <- mutate(Age2010, Year = 2010)
Age2010 <- Age2010[,c(1,27,2:23,26)]
Age2010 <- Age2010[,-3]
names(Age2010) <- colnames(Can_Age2001)



###USA Age 2015

Age2015 <- Age2015[,c(3,4,184, 178, 180, 182, 10, 16, 22, 28, 34, 40, 46, 52, 58, 64, 70, 76, 82, 88, 94, 100, 106, 112)]
Age2015 <- mutate(Age2015, Year = 2015)
Age2015 <- Age2015[,c(1,25,2:24)]
Age2015 <- Age2015[,-3]
names(Age2015) <- colnames(Can_Age2001)



###Income 2000 USA

Income2000 <- Income2000[,c(3, 118, 99, 101, 103,105, 107, 109, 111, 113, 115, 117)]
Income2000 <- mutate(Income2000, Year = 2000)
Income2000 <- Income2000[,c(1,13,2:12)]
names(Income2000) <- c("City", "Year", "Median.Income", "inc.0.10", "inc.10.15", "inc.15.25", "inc.25.35", "inc.35.50",
                       "inc.50.75", "inc.75.100", "inc.100.150", "inc.150.200", "inc.200.plus")


###Income 2010 USA

Income2010 <- Income2010[,c(3, 92, 12, 20, 28, 36, 44, 52, 60,68, 76, 84)]
Income2010 <- mutate(Income2010, Year = 2010)
Income2010 <- Income2010[,c(1,13,2:12)]
names(Income2010) <- colnames(Income2000)


###Income 2015 USA

Income2015 <- Income2015[,c(3, 92, 12, 20, 28, 36, 44, 52, 60,68, 76, 84)]
Income2015 <- mutate(Income2015, Year = 2015)
Income2015 <- Income2015[,c(1,13,2:12)]
names(Income2015) <- colnames(Income2000)


#Income CAN2001

Can_Income2001 <- Can_Income2001[,c(4,2)]
Can_Income2001$Year <- 2001
Can_Income2001 <- Can_Income2001[,c(1,3,2)]
Can_Income2001$inc.0.10 <- NA
Can_Income2001$inc.10.15 <- NA
Can_Income2001$inc.15.25 <- NA
Can_Income2001$inc.25.35 <- NA
Can_Income2001$inc.35.50 <- NA
Can_Income2001$inc.50.75 <- NA
Can_Income2001$inc.75.100 <- NA
Can_Income2001$inc.100.150 <- NA
Can_Income2001$inc.150.200 <- NA
Can_Income2001$inc.200.plus <- NA

names(Can_Income2001)[3] <- "Median.Income"

#Income CAN2011

Can_Income2011 <- mutate(Can_Income2011, inc.0.10 = (Can_Income2011[,20] + Can_Income2011[,21])/Can_Income2011[,19]*100)
Can_Income2011 <- mutate(Can_Income2011, inc.10.15 = Can_Income2011[,22]/Can_Income2011[,19]*100)
Can_Income2011 <- mutate(Can_Income2011, inc.15.25 = (Can_Income2011[,23] + (Can_Income2011[,24]/2))/Can_Income2011[,19]*100)
Can_Income2011 <- mutate(Can_Income2011, inc.25.35 = ((Can_Income2011[,25]/2) + (Can_Income2011[,24]/2))/Can_Income2011[,19]*100)
Can_Income2011 <- mutate(Can_Income2011, inc.35.50 = (Can_Income2011[,26] + (Can_Income2011[,25]/2))/Can_Income2011[,19]*100)
Can_Income2011 <- mutate(Can_Income2011, inc.50.75 = (Can_Income2011[,27] + (Can_Income2011[,28]/2))/Can_Income2011[,19]*100)
Can_Income2011 <- mutate(Can_Income2011, inc.75.100 = (Can_Income2011[,29] + (Can_Income2011[,28]/2))/Can_Income2011[,19]*100)
Can_Income2011 <- mutate(Can_Income2011, inc.100.150 = NA)
Can_Income2011 <- mutate(Can_Income2011, inc.150.200 = NA)
Can_Income2011 <- mutate(Can_Income2011, inc.inc.200.plus = NA)
Can_Income2011 <- mutate(Can_Income2011, Year = 2011)
Can_Income2011 <- Can_Income2011[,c(1,45,33,35:44)]
names(Can_Income2011) <- colnames(Income2000)

###Industry 2015 USA:

Industry2015 <- Industry2015[,c(3,4,24,34,104,44,54,64,74,94,114,134, 144, 164, 174, 184, 204, 214, 234, 244, 254, 264)] 

Industry2015 <- mutate(Industry2015, i.agri = Industry2015[,3]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.mining = Industry2015[,4]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.utilities = Industry2015[,5]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.construction = Industry2015[,6]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.manufacturing = Industry2015[,7]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.wholesale = Industry2015[,8]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.retail = Industry2015[,9]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.transportation = Industry2015[,10]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.info.culture = Industry2015[,11]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.finance.insur = Industry2015[,12]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.realestate = Industry2015[,13]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.sci.technical = Industry2015[,14]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.company.management = Industry2015[,15]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.admin.waste = Industry2015[,16]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.education = Industry2015[,17]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.health = Industry2015[,18]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.arts.entertain = Industry2015[,19]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.food.accommo = Industry2015[,20]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.other = Industry2015[,21]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.pub.admin = Industry2015[,22]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, Year = 2015)


Industry2015 <- Industry2015[,c(1,43,23:42)]
names(Industry2015)[1] <- "City"

###CAN Industry 2001

Can_Industry2001 <- mutate(Can_Industry2001, i.agri = Can_Industry2001[,4]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.mining = Can_Industry2001[,5]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.utilities = Can_Industry2001[,6]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.construction = Can_Industry2001[,7]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.manufacturing = Can_Industry2001[,8]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.wholesale = Can_Industry2001[,9]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.retail = Can_Industry2001[,10]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.transportation = Can_Industry2001[,11]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.info.culture = Can_Industry2001[,12]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.finance.insur = Can_Industry2001[,13]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.realestate = Can_Industry2001[,14]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.sci.technical = Can_Industry2001[,15]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.company.management = Can_Industry2001[,16]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.admin.waste = Can_Industry2001[,17]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.education = Can_Industry2001[,18]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.health = Can_Industry2001[,19]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.arts.entertain = Can_Industry2001[,20]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.food.accommo = Can_Industry2001[,21]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.other = Can_Industry2001[,22]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.pub.admin = Can_Industry2001[,23]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, Year = 2001)

Can_Industry2001 <- Can_Industry2001[,c(24,45,25:44)]
names(Can_Industry2001)[1] <- "City" 

###CAN Industry 2011

Can_Industry2011 <- Can_Industry2011[,c(1, 2, 3, 17, 26, 31, 45, 153, 189, 229, 270, 289, 306, 318, 329, 332, 346, 355, 373, 386, 396, 415)]

Can_Industry2011 <- mutate(Can_Industry2011, i.agri = Can_Industry2011[,3]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.mining = Can_Industry2011[,4]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.utilities = Can_Industry2011[,5]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.construction = Can_Industry2011[,6]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.manufacturing = Can_Industry2011[,7]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.wholesale = Can_Industry2011[,8]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.retail = Can_Industry2011[,9]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.transportation = Can_Industry2011[,10]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.info.culture = Can_Industry2011[,11]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.finance.insur = Can_Industry2011[,12]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.realestate = Can_Industry2011[,13]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.sci.technical = Can_Industry2011[,14]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.company.management = Can_Industry2011[,15]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.admin.waste = Can_Industry2011[,16]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.education = Can_Industry2011[,17]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.health = Can_Industry2011[,18]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.arts.entertain = Can_Industry2011[,19]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.food.accommo = Can_Industry2011[,20]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.other = Can_Industry2011[,21]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.pub.admin = Can_Industry2011[,22]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, Year = 2011)

Can_Industry2011 <- Can_Industry2011[,c(1,43,23:42)]
names(Can_Industry2011)[1] <- "City"

###Race USA 2000


Race2000 <- Race2000[,c(3, 6, 9, 11, 23, 83, 13, 53, 55)]
Race2000 <- Race2000[,-2]
Race2000$Year <- 2000
Race2000 <- Race2000[,c(1,9,2:8)]
names(Race2000) <- c("City", "Year", "race.white", "race.black", "race.asian", "race.latin", "race.indigenous", "race.other", "race.mixed")

###Race USA 2010


Race2010 <- Race2010[,c(3, 9, 11, 13, 57, 23, 27, 29)]
Race2010$Year <- 2010
Race2010 <- Race2010[,c(1,9,2,3,6,5,4,7,8)]
names(Race2010) <- c("City", "Year", "race.white", "race.black", "race.asian", "race.latin", "race.indigenous", "race.other", "race.mixed")


###Race USA 2015


Race2015 <- Race2015[,c(3, 130, 134, 158, 266, 138, 210, 214)]
Race2015$Year <- 2015
Race2015 <- Race2015[,c(1,9,2:8)]
names(Race2015) <- colnames(Race2000)

##Race CAN 2001

Can_Race2001 <- mutate(Can_Race2001, race.white = Can_Race2001[,4]/Can_Race2001[,2]*100)
Can_Race2001 <- mutate(Can_Race2001, race.black = Can_Race2001[,7]/Can_Race2001[,2]*100)
Can_Race2001 <- mutate(Can_Race2001, race.asian = (Can_Race2001[,5] + Can_Race2001[,6] + Can_Race2001[,8] + Can_Race2001[,10] + Can_Race2001[,13] + Can_Race2001[,14])/Can_Race2001[,2]*100)
Can_Race2001 <- mutate(Can_Race2001, race.latin = Can_Race2001[,9]/Can_Race2001[,2]*100)
Can_Race2001 <- mutate(Can_Race2001, race.indigenous = Can_Race2001[,29]/Can_Race2001[,2]*100)
Can_Race2001 <- mutate(Can_Race2001, race.other = (Can_Race2001[,11] + Can_Race2001[,12] + Can_Race2001[,15])/Can_Race2001[,2]*100)
Can_Race2001 <- mutate(Can_Race2001, race.mixed = Can_Race2001[,16]/Can_Race2001[,2]*100)
Can_Race2001$Year <- 2001

Can_Race2001 <- Can_Race2001[,c(1,37,30:36)]
names(Can_Race2001)[1] <- "City"

##Race CAN 2011

Can_Race2011 <- mutate(Can_Race2011, race.white = (Can_Race2011[,180] - Can_Race2011[,182])/Can_Race2011[,166]*100)
Can_Race2011 <- mutate(Can_Race2011, race.black = Can_Race2011[,170]/Can_Race2011[,166]*100)
Can_Race2011 <- mutate(Can_Race2011, race.asian = (Can_Race2011[,168] + Can_Race2011[,169] + Can_Race2011[,171] + Can_Race2011[,174] + Can_Race2011[,176] + Can_Race2011[,177])/Can_Race2011[,166]*100)
Can_Race2011 <- mutate(Can_Race2011, race.latin = Can_Race2011[,172]/Can_Race2011[,166]*100)
Can_Race2011 <- mutate(Can_Race2011, race.indigenous = Can_Race2011[,182]/Can_Race2011[,166]*100)
Can_Race2011 <- mutate(Can_Race2011, race.other = (Can_Race2011[,178] + Can_Race2011[,175] + Can_Race2011[,173])/Can_Race2011[,166]*100)
Can_Race2011 <- mutate(Can_Race2011, race.mixed = Can_Race2011[,179]/Can_Race2011[,166]*100)
Can_Race2011$Year <- 2011

Can_Race2011 <- Can_Race2011[,c(1,336,329:335)]
names(Can_Race2011)[1] <- "City"

####CAN Census 2016

Can_Census2016 <- mutate(Can_Census2016, age.Less5 = Can_Census2016[,10]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.5.9 = Can_Census2016[,11]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.10.14 = Can_Census2016[,12]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.15.19 = Can_Census2016[,13]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.20.24 = Can_Census2016[,14]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.25.29 = Can_Census2016[,15]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.30.34 = Can_Census2016[,16]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.35.39 = Can_Census2016[,17]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.40.44 = Can_Census2016[,18]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.45.49 = Can_Census2016[,19]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.50.54 = Can_Census2016[,20]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.55.59 = Can_Census2016[,21]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.60.64 = Can_Census2016[,22]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.65.69 = Can_Census2016[,23]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.70.74 = Can_Census2016[,24]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.75.79 = Can_Census2016[,25]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.80.84 = Can_Census2016[,26]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.Plus85 = Can_Census2016[,27]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, m.per.f.100 = (Can_Census2016[,4] / Can_Census2016[,5])*100)
Can_Census2016$Year <- 2016
Can_Census2016 <- Can_Census2016[,-c(3,4,5)]


### US Census 1990

Census_1990 <- mutate(Census_1990, age.Less5 = Census_1990[,8]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.5.9 = Census_1990[,9]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.10.14 = Census_1990[,10]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.15.19 = Census_1990[,11]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.20.24 = Census_1990[,12]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.25.29 = Census_1990[,13]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.30.34 = Census_1990[,14]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.35.39 = Census_1990[,15]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.40.44 = Census_1990[,16]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.45.49 = Census_1990[,17]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.50.54 = Census_1990[,18]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.55.59 = Census_1990[,19]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.60.64 = Census_1990[,20]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.65.69 = Census_1990[,21]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.70.74 = Census_1990[,22]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.75.79 = Census_1990[,23]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.80.84 = Census_1990[,24]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.Plus85 = Census_1990[,25]/Census_1990[,7]*100)

Census_1990 <- Census_1990[,-7]

Now that I have my data compiled into uniform sets, it is time for some cleaning. I will: - Make sure the city names match the names in the majorfour dataset for when I merge - Convert CAD to USD - Convert prices in previous years to one uniform price based on inflation

##City Name Change
city.frame <- as.data.frame(table(majorfour$City))
city.frame <- city.frame[1]
names(city.frame)[1] <- "City"

citylist2 <- as.character(city.frame$City)
citylist2 <- citylist2[-c(6, 15, 17, 27, 35, 40, 51,52,54)]

city.total <- data.frame(citylist2)
city.nonash <- data.frame(city.total[-24,])

Age2000 <- cbind(city.nonash, Age2000)
Age2000 <- Age2000[,-2]
names(Age2000)[1] <- "City"

Age2010 <- cbind(city.total, Age2010)
Age2010 <- Age2010[,-2]
names(Age2010)[1] <- "City"

Age2015 <- cbind(city.total, Age2015)
Age2015 <- Age2015[,-2]
names(Age2015)[1] <- "City"

Income2000 <- cbind(city.nonash, Income2000)
Income2000 <- Income2000[,-2]
names(Income2000)[1] <- "City"

Income2010 <- cbind(city.total, Income2010)
Income2010 <- Income2010[,-2]
names(Income2010)[1] <- "City"

Income2015 <- cbind(city.total, Income2015)
Income2015 <- Income2015[,-2]
names(Income2015)[1] <- "City"

Industry2015 <- cbind(city.total, Industry2015)
Industry2015 <- Industry2015[,-2]
names(Industry2015)[1] <- "City"

Race2000 <- cbind(city.nonash, Race2000)
Race2000 <- Race2000[,-2]
names(Race2000)[1] <- "City"

Race2010 <- cbind(city.total, Race2010)
Race2010 <- Race2010[,-2]
names(Race2010)[1] <- "City"

Race2015 <- cbind(city.total, Race2015)
Race2015 <- Race2015[,-2]
names(Race2015)[1] <- "City"

###Converting CAD to USD

conversion <- read.csv("Conversion_USCAD.csv", header = T)

Can_Income2001$Median.Income <- Can_Income2001$Median.Income *0.646223
Can_Income2011$Median.Income <- Can_Income2011$Median.Income *1.011464
Can_Census1996$Median.Income <- Can_Census1996$Median.Income *0.733274
Can_Census2016$Median.Income <- Can_Census2016$Median.Income* 0.782992

###Inflation conversion

Income2000 <- mutate(Income2000, adj.median.income = Income2000$Median.Income * 1.42)
Income2000 <- Income2000[,c(1:3, 14, 4:13)]
Income2010 <- mutate(Income2010, adj.median.income = Income2010$Median.Income *1.12)
Income2010 <- Income2010[,c(1:3, 14, 4:13)]
Income2015 <- mutate(Income2015, adj.median.income = Income2015$Median.Income *1.03)
Income2015 <- Income2015[,c(1:3, 14, 4:13)]
Census_1990 <- mutate(Census_1990, adj.median.income = Census_1990$Median.Income *1.87)
Census_1990 <- Census_1990[,c(1:25, 63, 26:62)]
                     
Can_Income2001 <- mutate(Can_Income2001, adj.median.income = Can_Income2001$Median.Income * 1.38)
Can_Income2001 <- Can_Income2001[,c(1:3, 14, 4:13)]              
Can_Income2011 <- mutate(Can_Income2011, adj.median.income = Can_Income2011$Median.Income * 1.09)
Can_Income2011 <- Can_Income2011[,c(1:3, 14, 4:13)]  
Can_Census1996 <- mutate(Can_Census1996, adj.median.income = Can_Census1996$Median.Income * 1.56)
Can_Census1996 <- Can_Census1996[,c(1:25, 63, 26:62)]
Can_Census2016 <- mutate(Can_Census2016, adj.median.income = Can_Census2016$Median.Income * 1.03)
Can_Census2016 <- Can_Census2016[,c(1:25, 63, 26:62)]

###Inflation for the ticket Prices

inflation <- read.csv("inflation.csv", header = T)

majorfour <- merge(majorfour, inflation, by = "Year", all.x = T)
majorfour <- mutate(majorfour, adj.ticket.price = majorfour$Ave.Tick*majorfour$Conversion)
majorfour <- majorfour[,c(1:11,23,12:14,19:21)]

My next step is to bind each individual demographic dataset together for each Census. Creating a dataset that contains all the variables for each Census:

Can_Census2001 <- merge(Can_Age2001, Can_Income2001, by = c("City", "Year"))
Can_Census2001 <- merge(Can_Census2001, Can_Race2001, by = c("City", "Year"))
Can_Census2001 <- merge(Can_Census2001, Can_Industry2001, by = c("City", "Year"))

Can_Census2011 <- merge(Can_Age2011, Can_Income2011, by = c("City", "Year"))
Can_Census2011 <- merge(Can_Census2011, Can_Race2011, by = c("City", "Year"))
Can_Census2011 <- merge(Can_Census2011, Can_Industry2011, by = c("City", "Year"))


US_Census2000 <- merge(Age2000, Income2000, by = c("City", "Year"))
US_Census2000 <- merge(US_Census2000, Race2000, by = c("City", "Year"))
nashville <- c("Nashville", 2000, rep(NA, 41))
US_Census2000 <- rbind(US_Census2000, nashville)
US_Census2000 <- US_Census2000[c(1:23,45,24:44),]

US_Census2010 <- merge(Age2010, Income2010, by = c("City", "Year"))
US_Census2010 <- merge(US_Census2010, Race2010, by = c("City", "Year"))

US_Census2015 <- merge(Age2015, Income2015, by = c("City", "Year"))
US_Census2015 <- merge(US_Census2015, Race2015, by = c("City", "Year"))
US_Census2015 <- merge(US_Census2015, Industry2015, by = c("City", "Year"))

head(US_Census2015)

##        City Year m.per.f.100 Median.Age Male.Median Female.Median
## 1   Anaheim 2015        99.6       33.6        32.3          34.8
## 2   Atlanta 2015        97.1       33.4        33.0          33.7
## 3 Baltimore 2015        89.1       34.6        33.3          35.9
## 4    Boston 2015        92.1       31.6        30.9          32.1
## 5   Buffalo 2015        91.3       33.1        31.5          34.6
## 6 Charlotte 2015        92.2       33.7        32.8          34.7
##   age.Less5 age.5.9 age.10.14 age.15.19 age.20.24 age.25.29 age.30.34
## 1       7.2     7.0       7.0       7.2       7.8       8.2       7.6
## 2       6.1     5.1       5.0       6.4      10.4      10.6       9.4
## 3       6.7     5.9       5.4       6.2       8.4       9.8       8.3
## 4       5.4     4.3       4.2       7.4      11.9      13.3       9.5
## 5       6.6     6.1       6.2       7.4       9.7       9.2       7.3
## 6       7.2     7.1       6.5       6.4       7.3       8.8       8.6
##   age.35.39 age.40.44 age.45.49 age.50.54 age.55.59 age.60.64 age.65.69
## 1       7.0       7.2       6.8       6.5       5.6       4.6       3.3
## 2       7.5       7.0       6.3       6.0       5.1       4.7       3.4
## 3       6.2       5.9       6.3       7.0       6.4       5.6       4.0
## 4       6.6       5.9       5.7       5.6       5.2       4.5       3.3
## 5       5.7       5.7       5.7       6.5       6.8       5.1       3.5
## 6       7.8       7.6       6.8       6.4       5.6       4.6       3.2
##   age.70.74 age.75.79 age.80.84 age.Plus85 Median.Income adj.median.income
## 1       2.5       1.7       1.3        1.4         60752          62574.56
## 2       2.7       1.8       1.3        1.3         47527          48952.81
## 3       2.7       2.1       1.6        1.5         42241          43508.23
## 4       2.5       1.7       1.5        1.6         55777          57450.31
## 5       2.7       2.1       1.9        1.6         31918          32875.54
## 6       2.2       1.6       1.2        1.1         53637          55246.11
##   inc.0.10 inc.10.15 inc.15.25 inc.25.35 inc.35.50 inc.50.75 inc.75.100
## 1      5.1       5.1       8.9       9.1      12.7      18.0       13.5
## 2     12.8       6.3      11.1       9.4      12.0      14.9        9.3
## 3     13.1       7.5      11.6      11.1      13.0      16.7       10.0
## 4     12.0       7.3       9.3       7.2      10.2      14.9       10.4
## 5     16.6       9.5      15.0      12.3      13.4      15.1        8.2
## 6      6.9       4.8      10.0      10.9      14.1      17.8       11.5
##   inc.100.150 inc.150.200 inc.200.plus race.white race.black race.asian
## 1        15.5         6.8          5.2       68.2        2.3       16.0
## 2        10.5         4.9          8.8       40.0       52.9        3.9
## 3         9.9         3.7          3.4       30.3       62.8        2.6
## 4        14.1         6.5          8.1       53.0       25.2        9.3
## 5         6.7         1.7          1.4       48.5       37.3        4.4
## 6        12.4         4.9          6.7       51.6       34.9        5.8
##   race.latin race.indigenous race.other race.mixed     i.agri    i.mining
## 1       53.1             0.3        9.6        3.2 0.37165859 0.048902446
## 2        5.0             0.2        1.0        1.9 0.14068773 0.017761242
## 3        4.6             0.3        1.7        2.3 0.13029196 0.023253239
## 4       18.8             0.4        7.5        4.5 0.10680266 0.003762695
## 5       10.8             0.4        5.5        3.8 0.06069635 0.083687395
## 6       13.5             0.3        4.6        2.7 0.18901332 0.034117029
##   i.utilities i.construction i.manufacturing i.wholesale  i.retail
## 1   0.3594330       7.421557       13.757481    3.953763 11.161983
## 2   0.4954452       2.938551        4.937625    2.657175  9.692964
## 3   0.6167645       4.863618        4.705274    1.924113  9.588824
## 4   0.4350255       3.338090        4.384987    1.448059  8.396600
## 5   0.3853299       3.317148        8.880060    2.026890 10.466442
## 6   0.8947627       6.245658        7.642214    3.202767 10.749852
##   i.transportation i.info.culture i.finance.insur i.realestate
## 1         2.961043       1.844845        4.700136     2.121144
## 2         4.869852       3.832222        5.524681     2.785243
## 3         4.479017       1.940723        3.342930     2.161075
## 4         2.719850       2.426649        7.193405     1.964706
## 5         4.110798       1.711453        5.206092     1.663632
## 6         4.374700       2.802577       10.781977     2.273638
##   i.sci.technical i.company.management i.admin.waste i.education i.health
## 1        5.808388           0.06418446      6.472239    6.299246 11.86985
## 2       14.939542           0.12432870      4.883874   11.983697  9.95097
## 3        7.135422           0.03654080      5.076588   12.747204 18.58524
## 4       11.675065           0.06657077      4.454163   13.356411 18.42274
## 5        5.236440           0.02850889      5.748680   10.785558 20.07486
## 6        8.374112           0.14045259      5.283657    7.170553 11.74348
##   i.arts.entertain i.food.accommo  i.other i.pub.admin
## 1         3.738592       9.136199 5.419614    2.489746
## 2         2.642218       8.821261 4.630075    4.131826
## 3         1.939246       6.962315 4.911601    8.829956
## 4         2.081060       8.549134 4.605829    4.371094
## 5         2.285310       9.826372 4.091486    4.010557
## 6         2.244751       8.873416 4.978347    1.999955

Census are not tallied every year. I perform some interpolation for the years in between to help create a complete data set. For the any years outside of my max or min census years, their missing values will unfortunately be kept as NA’s.

###interpolation of demographic information for Canada
all.can.census <- bind_rows(Can_Census1996, Can_Census2001, Can_Census2011, Can_Census2016)
remain.years <- data.frame(canlist, rep(c(1997:2000,2002:2010,2012:2015), each = 7))
names(remain.years) <- c("City", "Year")
all.can.census <- bind_rows(all.can.census, remain.years)
all.can.census <- all.can.census[order(all.can.census[,1], all.can.census[,2]), ]
all.can.census[,c(3,7:26)] <- lapply(all.can.census[,c(3,7:26)], na.approx)

can.setp.2 <- all.can.census[,c(1,2,4:6)]
can.setp.2 <- subset(can.setp.2, Year > 2000)
can.setp.2[,c(3:5)] <- na.approx(can.setp.2[,c(3:5)])

can.setp.3 <- all.can.census[,c(1,2,37:45,47:54,56,58,59,61:63)]
can.setp.3 <- subset(can.setp.3, Year < 2012)
can.setp.3[,c(3:25)] <- na.approx(can.setp.3[,c(3:25)])

can.setp.4 <- all.can.census[,c(1,2,46,55,57,60)]
can.setp.4 <- subset(can.setp.4, Year > 2000 & Year < 2012)
can.setp.4[,c(3:6)] <- na.approx(can.setp.4[,c(3:6)])


all.can.census <- all.can.census[,c(1,2,3,7:36 )]
all.can.census <- merge(all.can.census, can.setp.2, all.x = T, by = c("City", "Year"))
all.can.census <- merge(all.can.census, can.setp.3, all.x = T, by = c("City", "Year"))
all.can.census <- merge(all.can.census, can.setp.4, all.x = T, by = c("City", "Year"))
all.can.census <- all.can.census[,c(1:3,34:36,4:33,37:45,60,46:53,61,54,62,55,56,63,57:59)]

outside.census.years <- data.frame(canlist, rep(c(1991:1995), each = 7))
names(outside.census.years) <- c("City", "Year")
all.can.census <- bind_rows(all.can.census, outside.census.years)
all.can.census <- all.can.census[order(all.can.census[,1], all.can.census[,2]), ]

###interpolation of demographic information for USA
citylist <- as.character(city.frame$City)
citylist <- citylist[-c(6, 15,17, 27, 35, 40, 51,52,54)]

US_Census2000[,-1] <- lapply(US_Census2000[,-1], as.numeric)
US_Census2010[,-1] <- lapply(US_Census2010[,-1], as.numeric)
US_Census2015[,-1] <- lapply(US_Census2015[,-1], as.numeric)


all.us.census <- bind_rows(Census_1990,US_Census2000, US_Census2010, US_Census2015)
remain.years.us <- data.frame(citylist, rep(c(1991:1999, 2001:2009,2011:2014), each = 45))
names(remain.years.us) <- c("City", "Year")
remain.years.us$Year <- as.numeric(remain.years.us$Year)

all.us.census <- bind_rows(all.us.census, remain.years.us)
all.us.census <- all.us.census[order(all.us.census[,1], all.us.census[,2]), ]

all.us.census[,c(3:26,37:42)] <- lapply(all.us.census[,c(3:26,37:42)], na.approx)

us.setp.2 <- all.us.census[,c(1,2,27:36,43)]
us.setp.2 <- subset(us.setp.2, Year > 1999)
us.setp.2[,c(3:13)] <- na.approx(us.setp.2[,c(3:13)])

all.us.census <- all.us.census[,c(1,2,3:26,37:42,44:63)]
all.us.census <- merge(all.us.census, us.setp.2, all.x = T, by = c("City", "Year"))
all.us.census <- all.us.census[,c(1:26,53:62,27:32,63,33:52)]

outside.census.years.us <- data.frame(citylist, rep(2016, each = 45))
names(outside.census.years.us) <- c("City", "Year")

all.us.census <- bind_rows(all.us.census, outside.census.years.us)
all.us.census <- all.us.census[order(all.us.census[,1], all.us.census[,2]), ]

all.us.census <- subset(all.us.census, all.us.census$City != "Hartford")

###American Industry Calculation - Giving each industry the 2015 values because no other Industry information was found

use <- subset(all.us.census, all.us.census$Year == 2015)
use <- use[,c(1,2,44:63)]
use.duplicated <- expandRows(use, count = 27, count.is.col = F)
use.duplicated <- use.duplicated[,-c(1,2)]
all.us.census <- cbind(all.us.census, use.duplicated) 
all.us.census <- all.us.census[,-c(44:63)]

Finally I can bind all the info together and merge the demographic variables to the majorfour dataset

demographics <- rbind(all.us.census, all.can.census)
majorfour.dem <- merge(majorfour, demographics, all.x = T, by =  c("City", "Year"))

With the demographic variables attached to the data on each team by year, I can now start creating some new variables. These variables will help me calculate the correlation between year by year variance in winning with year by year variance in attendance. They will also help me determine if there are any significant variables that could predict what type of city would have these high or low correlations.

jump.frame <- majorfour.dem

###Calculate the percentage of capacity to population
jump.frame <- mutate(jump.frame, pct.capTOpop = jump.frame$Capacity/jump.frame$Pop *100)

###Calculate the zscore of the pct.capTOpop
jump.frame <- mutate(jump.frame, z.pct.cap.pop = scale(jump.frame$pct.capTOpop))
jump.frame$z.pct.cap.pop <- as.numeric(jump.frame$z.pct.cap.pop)

###Calculate the percentage of attendance to population
jump.frame <- mutate(jump.frame, pct.attTOpop = jump.frame$Ave.Attendance/jump.frame$Pop *100)

###Calculate the zscore of the pct.capTOpop
jump.frame <- mutate(jump.frame, z.pct.att.pop = scale(jump.frame$pct.attTOpop))
jump.frame$z.pct.att.pop <- as.numeric(jump.frame$z.pct.att.pop)

###Calculate the percentage of ticket price to income
jump.frame <- mutate(jump.frame, pct.tick.income = jump.frame$adj.ticket.price/jump.frame$adj.median.income*100)

###Calculate the percentage of payroll vs salary cap
jump.frame <- mutate(jump.frame, payroll.pct = jump.frame$Payroll/jump.frame$Salary.Cap*100)

###Calcule the salary cap pct by winning pct
jump.frame <- mutate(jump.frame, winning.per = jump.frame$Wpct/jump.frame$payroll.pct*100)

### Calculating the 1 year difference in ticket price
diff.func.tick <- function (t,s,y) {
  y2 <- y-1
  diff.att.2 <- jump.frame$adj.ticket.price[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y] - jump.frame$adj.ticket.price[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y2]
  diff.att.2 <- diff.att.2/jump.frame$adj.ticket.price[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y2]*100
  result <- diff.att.2
}

jump.frame$tickprice.diff.1 <- mapply(diff.func.tick, t = jump.frame$Team, s= jump.frame$Venue, y= jump.frame$Year)
jump.frame$tickprice.diff.1 <- as.numeric(jump.frame$tickprice.diff.1)
jump.frame$tickprice.diff.1 <- round(jump.frame$tickprice.diff.1, digits = 4)

### Calculating the 2 year difference in ticket price
diff.func.tick.2 <- function (t,s,y) {
  y2 <- y-2
  diff.att.2 <- jump.frame$adj.ticket.price[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y] - jump.frame$adj.ticket.price[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y2]
  diff.att.2 <- diff.att.2/jump.frame$adj.ticket.price[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y2]*100
  result <- diff.att.2
}

jump.frame$tickprice.diff.2 <- mapply(diff.func.tick.2, t = jump.frame$Team, s= jump.frame$Venue, y= jump.frame$Year)
jump.frame$tickprice.diff.2 <- as.numeric(jump.frame$tickprice.diff.2)
jump.frame$tickprice.diff.2 <- round(jump.frame$tickprice.diff.2, digits = 4)

### Calculating the 3 year difference in ticket price
diff.func.tick.3 <- function (t,s,y) {
  y2 <- y-3
  diff.att.2 <- jump.frame$adj.ticket.price[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y] - jump.frame$adj.ticket.price[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y2]
  diff.att.2 <- diff.att.2/jump.frame$adj.ticket.price[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y2]*100
  result <- diff.att.2
}

jump.frame$tickprice.diff.3 <- mapply(diff.func.tick.3, t = jump.frame$Team, s= jump.frame$Venue, y= jump.frame$Year)
jump.frame$tickprice.diff.3 <- as.numeric(jump.frame$tickprice.diff.3)
jump.frame$tickprice.diff.3 <- round(jump.frame$tickprice.diff.3, digits = 4)

### Calculating the 1 year difference in attendance
diff.func.2 <- function (t,s,y) {
  y2 <- y-1
  diff.att.2 <- jump.frame$att.pct[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y] - jump.frame$att.pct[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y2]
  result <- diff.att.2
}

jump.frame$att.diff.control <- mapply(diff.func.2, t = jump.frame$Team, s= jump.frame$Venue, y= jump.frame$Year)
jump.frame$att.diff.control <- as.numeric(jump.frame$att.diff.control)
jump.frame$att.diff.control <- jump.frame$att.diff.control*100
jump.frame$att.diff.control <- round(jump.frame$att.diff.control, digits = 4)


### Calculating the 2 year difference in attendance
diff.func.3 <- function (t,s,y) {
  y2 <- y-2
  diff.att.3 <- jump.frame$att.pct[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y] - jump.frame$att.pct[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y2]
  result <- diff.att.3
}

jump.frame$att.diff.control.2 <- mapply(diff.func.3, t = jump.frame$Team, s= jump.frame$Venue, y= jump.frame$Year)
jump.frame$att.diff.control.2 <- as.numeric(jump.frame$att.diff.control.2)
jump.frame$att.diff.control.2 <- jump.frame$att.diff.control.2*100
jump.frame$att.diff.control.2 <- round(jump.frame$att.diff.control.2, digits = 4)

### Calculating the 3 year difference in attendance
diff.func.4 <- function (t,s,y) {
  y2 <- y-3
  diff.att.3 <- jump.frame$att.pct[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y] - jump.frame$att.pct[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y2]
  result <- diff.att.3
}

jump.frame$att.diff.control.3 <- mapply(diff.func.4, t = jump.frame$Team, s= jump.frame$Venue, y= jump.frame$Year)
jump.frame$att.diff.control.3 <- as.numeric(jump.frame$att.diff.control.3)
jump.frame$att.diff.control.3 <- jump.frame$att.diff.control.3*100
jump.frame$att.diff.control.3 <- round(jump.frame$att.diff.control.3, digits = 4)


### Calculating the 1 year difference in winning percentage
diff.func.win <- function (t,y) {
  y2 <- y-1
  diff.win <- jump.frame$Wpct[jump.frame$Team == t & jump.frame$Year == y] - jump.frame$Wpct[jump.frame$Team == t & jump.frame$Year == y2]
  result <- diff.win
}

jump.frame$win.diff <- mapply(diff.func.win, t = jump.frame$Team, y= jump.frame$Year)
jump.frame$win.diff <- as.numeric(jump.frame$win.diff)
jump.frame$win.diff <- jump.frame$win.diff*100


### Calculating the 2 year difference in winning percentage
diff.func.win.2 <- function (t,y) {
  y2 <- y-2
  diff.win <- jump.frame$Wpct[jump.frame$Team == t & jump.frame$Year == y] - jump.frame$Wpct[jump.frame$Team == t & jump.frame$Year == y2]
  result <- diff.win
}

jump.frame$win.diff.2 <- mapply(diff.func.win.2, t = jump.frame$Team, y= jump.frame$Year)
jump.frame$win.diff.2 <- as.numeric(jump.frame$win.diff.2)
jump.frame$win.diff.2 <- jump.frame$win.diff.2*100

### Calculating the 3 year difference in winning percentage
diff.func.win.3 <- function (t,y) {
  y2 <- y-3
  diff.win <- jump.frame$Wpct[jump.frame$Team == t & jump.frame$Year == y] - jump.frame$Wpct[jump.frame$Team == t & jump.frame$Year == y2]
  result <- diff.win
}

jump.frame$win.diff.3 <- mapply(diff.func.win.3, t = jump.frame$Team, y= jump.frame$Year)
jump.frame$win.diff.3 <- as.numeric(jump.frame$win.diff.3)
jump.frame$win.diff.3 <- jump.frame$win.diff.3*100

I need to standardize the winning pct jumps between each league. It is much easier for a football team to have a 70% increase or decrease in winning percentage year over year because they only play 16 games a year. Compare that to an MLB team who plays 162 - you would see a lot less variance in winning percentage in that league.

mlb.mean.win <- mean(jump.frame$win.diff[jump.frame$League == "MLB"], na.rm = T)
nba.mean.win <- mean(jump.frame$win.diff[jump.frame$League == "NBA"], na.rm = T)
nhl.mean.win <- mean(jump.frame$win.diff[jump.frame$League == "NHL"], na.rm = T)
nfl.mean.win <- mean(jump.frame$win.diff[jump.frame$League == "NFL"], na.rm = T)

mlb.sd.win <- sd(jump.frame$win.diff[jump.frame$League == "MLB"], na.rm = T)
nba.sd.win <- sd(jump.frame$win.diff[jump.frame$League == "NBA"], na.rm = T)
nhl.sd.win <- sd(jump.frame$win.diff[jump.frame$League == "NHL"], na.rm = T)
nfl.sd.win <- sd(jump.frame$win.diff[jump.frame$League == "NFL"], na.rm = T)

jump.frame$mean.win <- ifelse(jump.frame$League == "MLB", mlb.mean.win, 
                              ifelse(jump.frame$League == "NBA", nba.mean.win, 
                                     ifelse(jump.frame$League == "NHL", nhl.mean.win,
                                            nfl.mean.win)))

jump.frame$sd.win <- ifelse(jump.frame$League == "MLB", mlb.sd.win, 
                              ifelse(jump.frame$League == "NBA", nba.sd.win, 
                                     ifelse(jump.frame$League == "NHL", nhl.sd.win,
                                            nfl.sd.win)))

jump.frame <- mutate(jump.frame, z.win.diff = (win.diff-mean.win)/sd.win)
jump.frame <- subset(jump.frame, select = -c(mean.win, sd.win))

###Standardizing the percentage jump in winning for the 2 year span

mlb.mean.win.2 <- mean(jump.frame$win.diff.2[jump.frame$League == "MLB"], na.rm = T)
nba.mean.win.2 <- mean(jump.frame$win.diff.2[jump.frame$League == "NBA"], na.rm = T)
nhl.mean.win.2 <- mean(jump.frame$win.diff.2[jump.frame$League == "NHL"], na.rm = T)
nfl.mean.win.2 <- mean(jump.frame$win.diff.2[jump.frame$League == "NFL"], na.rm = T)

mlb.sd.win.2 <- sd(jump.frame$win.diff.2[jump.frame$League == "MLB"], na.rm = T)
nba.sd.win.2 <- sd(jump.frame$win.diff.2[jump.frame$League == "NBA"], na.rm = T)
nhl.sd.win.2 <- sd(jump.frame$win.diff.2[jump.frame$League == "NHL"], na.rm = T)
nfl.sd.win.2 <- sd(jump.frame$win.diff.2[jump.frame$League == "NFL"], na.rm = T)

jump.frame$mean.win.2 <- ifelse(jump.frame$League == "MLB", mlb.mean.win.2, 
                              ifelse(jump.frame$League == "NBA", nba.mean.win.2, 
                                     ifelse(jump.frame$League == "NHL", nhl.mean.win.2,
                                            nfl.mean.win.2)))

jump.frame$sd.win.2 <- ifelse(jump.frame$League == "MLB", mlb.sd.win.2, 
                            ifelse(jump.frame$League == "NBA", nba.sd.win.2, 
                                   ifelse(jump.frame$League == "NHL", nhl.sd.win.2,
                                          nfl.sd.win.2)))

jump.frame <- mutate(jump.frame, z.win.diff.2 = (win.diff.2-mean.win.2)/sd.win.2)

jump.frame <- subset(jump.frame, select = -c(mean.win.2, sd.win.2))

###Standardizing the percentage jump in winning for the 3 year span

mlb.mean.win.3 <- mean(jump.frame$win.diff.3[jump.frame$League == "MLB"], na.rm = T)
nba.mean.win.3 <- mean(jump.frame$win.diff.3[jump.frame$League == "NBA"], na.rm = T)
nhl.mean.win.3 <- mean(jump.frame$win.diff.3[jump.frame$League == "NHL"], na.rm = T)
nfl.mean.win.3 <- mean(jump.frame$win.diff.3[jump.frame$League == "NFL"], na.rm = T)

mlb.sd.win.3 <- sd(jump.frame$win.diff.3[jump.frame$League == "MLB"], na.rm = T)
nba.sd.win.3 <- sd(jump.frame$win.diff.3[jump.frame$League == "NBA"], na.rm = T)
nhl.sd.win.3 <- sd(jump.frame$win.diff.3[jump.frame$League == "NHL"], na.rm = T)
nfl.sd.win.3 <- sd(jump.frame$win.diff.3[jump.frame$League == "NFL"], na.rm = T)

jump.frame$mean.win.3 <- ifelse(jump.frame$League == "MLB", mlb.mean.win.3, 
                                ifelse(jump.frame$League == "NBA", nba.mean.win.3, 
                                       ifelse(jump.frame$League == "NHL", nhl.mean.win.3,
                                              nfl.mean.win.3)))

jump.frame$sd.win.3 <- ifelse(jump.frame$League == "MLB", mlb.sd.win.3, 
                              ifelse(jump.frame$League == "NBA", nba.sd.win.3, 
                                     ifelse(jump.frame$League == "NHL", nhl.sd.win.3,
                                            nfl.sd.win.3)))

jump.frame <- mutate(jump.frame, z.win.diff.3 = (win.diff.3-mean.win.3)/sd.win.3)

jump.frame <- subset(jump.frame, select = -c(mean.win.3, sd.win.3))

Is this standardization necessary for attendance. If we look at the spread of attendance, I’d say it is quite uniform between the 4 leagues:

plot_ly(jump.frame, x = jump.frame$Team, y = jump.frame$att.diff.control , color = jump.frame$League, 
                  type = 'scatter', mode = 'marker',
                  hoverinfo = 'text', text = jump.frame$Team)

## A marker object has been specified, but markers is not in the mode
## Adding markers to the mode...
## A marker object has been specified, but markers is not in the mode
## Adding markers to the mode...
## A marker object has been specified, but markers is not in the mode
## Adding markers to the mode...
## A marker object has been specified, but markers is not in the mode
## Adding markers to the mode...

We can now calculate correlation. Is the dataset parametric? The variables look normal however we have too many missing variables. I have decided to stick with the Spearman correlation.

#Test for Normality
hist(jump.frame$z.win.diff)

hist(jump.frame$att.diff.control)

shapiro.test(jump.frame$z.win.diff)

## 
##  Shapiro-Wilk normality test
## 
## data:  jump.frame$z.win.diff
## W = 0.99903, p-value = 0.1137

shapiro.test(jump.frame$att.diff.control)

## 
##  Shapiro-Wilk normality test
## 
## data:  jump.frame$att.diff.control
## W = 0.9267, p-value < 2.2e-16

qqnorm(jump.frame$z.win.diff)

qqnorm(jump.frame$att.diff.control)

norm.test <- subset(jump.frame, select = c("z.win.diff", "att.diff.control"))
sum(complete.cases(norm.test))/3067*100

## [1] 89.37072

I have decided to only use what I consider ‘major jumps’ in winning percentage to calculate the correlation. This will highlight a fanbase’s reliance on success or failure to dictate their attendance behaviours. My cut off is anything inbetween the zscore of -0.5 0.5, which equates to around 3-7 percent jump in the NBA, MLB and NHL and a 12 percent jump in the NFl (around 50% of the data). Aything smaller that leaves too much up to random variance.

hist(jump.frame$z.win.diff)

jump.frame.big <- subset(jump.frame, z.win.diff < -.5 | z.win.diff.2 > .5)
nrow(jump.frame.big)/3067*100

## [1] 53.66808

Check for outliers - it looks like I found a mistake (I have corected this mistake earlier in the script). Also remove any teams where they have 6 or less records (not enough to prove the correlation has actual strength).

boxplot(jump.frame.big$z.win.diff)

boxplot(jump.frame.big$att.diff.control)

count.jump.frame.big <- count(jump.frame.big, 'Team')
big.list <- subset(count.jump.frame.big, freq <7, select = Team)
big.list <- big.list$Team

team.df.2 <- as.data.frame(table(jump.frame.big$Team))
team.df.2 <- team.df.2[1]
names(team.df.2)[1] <- "Team"

leaguechecking <- subset(majorfour, select = c('Team', 'League', 'City'))
leaguechecking <- unique(leaguechecking[c("Team", "League", "City")])

team.df.2 <- merge(team.df.2, leaguechecking, all.x = T)
team.df.2 <- subset(team.df.2, !(team.df.2$Team %in% big.list))

Calculate the correlation between 1 year percent difference in winning percentage and 1 year percent difference in attendance. I have also decided to convert any negative correlations to zero. For the way I am looking at this correlation (trying to find fairweather fans), a negative correlation shows the fanbase has no reliance on winning to dictate their attendance. You could argue that the fans show up more when the team loses but I’d say that is not the case.

cor.1by1.big <- function (x) {
  cor(jump.frame.big$z.win.diff[jump.frame.big$Team == x], jump.frame.big$att.diff.control[jump.frame.big$Team == x], use = "na.or.complete", method = "spearman")
}

team.df.2$cor.1by1.big <- lapply(team.df.2$Team, cor.1by1.big)
team.df.2$cor.1by1.big <- as.numeric(team.df.2$cor.1by1.big)
Cor.Team <- team.df.2[,c(1,4)]
Cor.Team <- Cor.Team[order(-Cor.Team$cor.1by1.big),]
Cor.Team[1:25,]

##                      Team cor.1by1.big
## 37         Denver Broncos    0.8225070
## 110  San Francisco Giants    0.8214286
## 25      Chicago White Sox    0.8181818
## 23          Chicago Bulls    0.8153846
## 75         Montreal Expos    0.8095238
## 9        Baltimore Ravens    0.7937200
## 6           Atlanta Hawks    0.7714286
## 40        Detroit Pistons    0.7623346
## 97  Philadelphia Phillies    0.7524757
## 112      Seattle Mariners    0.7472527
## 125       Toronto Raptors    0.7417582
## 68      Milwaukee Brewers    0.7363636
## 105      Sacramento Kings    0.7302967
## 14          Buffalo Bills    0.7174763
## 102   Pittsburgh Steelers    0.7173536
## 71        Minnesota Twins    0.7132867
## 3    Arizona Diamondbacks    0.7062937
## 131   Washington Redskins    0.7062711
## 132    Washington Wizards    0.6947368
## 126             Utah Jazz    0.6923077
## 101    Pittsburgh Pirates    0.6879289
## 29    Cleveland Cavaliers    0.6490653
## 60      Los Angeles Kings    0.6484848
## 42         Detroit Tigers    0.6484255
## 48         Houston Astros    0.6483516

###Converting negative to zero
jump.with.neg <- team.df.2 

team.df.2[team.df.2$cor.1by1.big < 0,4] <- 0

demo.jump.frame.big <- aggregate(jump.frame.big[c(12,17,19:87)], by = list(jump.frame.big$Team), mean, na.rm = T)
team.df.2 <- merge(team.df.2, demo.jump.frame.big, by.x = "Team", by.y = "Group.1", all.x = T)

Now that I have found which fanbases show the highest correlation between winning and attendance, I will now see if there are any demographic variables that show significance when predicting a highly correlated fanbase.

To begin, I will remove any NA records as well as select the predictor variables that I believe have an actual chance at predicting my outcome:

team.df.1 <- team.df.2[complete.cases(team.df.2),]

set.seed(100)

coreset <- team.df.1[,c(4:10,30,41:75)]

Split my data into training and test:

train_index <- createDataPartition(coreset$cor.1by1.big, times = 1, p = 0.7, list = F)
trainset <- coreset[train_index,]
testset <- coreset[-train_index,]

train_cor <- trainset$cor.1by1.big
test_cor <- testset$cor.1by1.big
testset<-testset[,-1]

Check for highly correlated variables and remove those that are highly correlated and may come from similar sources (Median Age instead of Male or Female median age, zscore of stadium capacity to population instead of percentage of stadium capacity to population OR percentage of attendance to population OR z score of attednance to population, payroll instead of winning percentage/payroll):

cor.test <- cor(trainset, use = "na.or.complete")
cor.test.melt <- arrange(melt(cor.test), -abs(value))

trainset.stripped <- trainset[,-c(6,7,36:38,42)]

Next I want to calculate the predictive power of each individual variable. I tried to create a function to perform this but ran into an unstoppable error so decided to do it manually.

every.lm.df <- data.frame(colnames(trainset.stripped[,-1]))
every.lm.df$pvalue <- NA
every.lm.df$rsquared <- NA
names(every.lm.df)[1] <- "variable"



###tried to build a function but did not work
fit1 <- lm(cor.1by1.big~adj.ticket.price, data = trainset.stripped)
fit2 <- lm(cor.1by1.big~Pop, data = trainset.stripped)
fit3 <- lm(cor.1by1.big~m.per.f.100, data = trainset.stripped)
fit4 <- lm(cor.1by1.big~Median.Age, data = trainset.stripped)
fit5 <- lm(cor.1by1.big~adj.median.income, data = trainset.stripped)
fit6 <- lm(cor.1by1.big~race.white, data = trainset.stripped)
fit7 <- lm(cor.1by1.big~race.black, data = trainset.stripped)
fit8 <- lm(cor.1by1.big~race.asian, data = trainset.stripped)
fit9 <- lm(cor.1by1.big~race.latin, data = trainset.stripped)
fit10 <- lm(cor.1by1.big~race.indigenous, data = trainset.stripped)
fit11 <- lm(cor.1by1.big~race.other, data = trainset.stripped)
fit12 <- lm(cor.1by1.big~race.mixed, data = trainset.stripped)
fit13 <- lm(cor.1by1.big~i.agri, data = trainset.stripped)
fit14 <- lm(cor.1by1.big~i.mining, data = trainset.stripped)
fit15 <- lm(cor.1by1.big~i.utilities, data = trainset.stripped)
fit16 <- lm(cor.1by1.big~i.construction, data = trainset.stripped)
fit17 <- lm(cor.1by1.big~i.manufacturing, data = trainset.stripped)
fit18 <- lm(cor.1by1.big~i.wholesale, data = trainset.stripped)
fit19 <- lm(cor.1by1.big~i.retail, data = trainset.stripped)
fit20 <- lm(cor.1by1.big~i.transportation, data = trainset.stripped)
fit21 <- lm(cor.1by1.big~i.info.culture, data = trainset.stripped)
fit22 <- lm(cor.1by1.big~i.finance.insur, data = trainset.stripped)
fit23 <- lm(cor.1by1.big~i.realestate, data = trainset.stripped)
fit24 <- lm(cor.1by1.big~i.sci.technical, data = trainset.stripped)
fit25 <- lm(cor.1by1.big~i.company.management, data = trainset.stripped)
fit26 <- lm(cor.1by1.big~i.admin.waste, data = trainset.stripped)
fit27 <- lm(cor.1by1.big~i.education, data = trainset.stripped)
fit28 <- lm(cor.1by1.big~i.health, data = trainset.stripped)
fit29 <- lm(cor.1by1.big~i.arts.entertain, data = trainset.stripped)
fit30 <- lm(cor.1by1.big~i.food.accommo, data = trainset.stripped)
fit31 <- lm(cor.1by1.big~i.other, data = trainset.stripped)
fit32 <- lm(cor.1by1.big~i.pub.admin, data = trainset.stripped)
fit33 <- lm(cor.1by1.big~z.pct.att.pop, data = trainset.stripped)
fit34 <- lm(cor.1by1.big~pct.tick.income, data = trainset.stripped)
fit35 <- lm(cor.1by1.big~payroll.pct, data = trainset.stripped)
fit36 <- lm(cor.1by1.big~tickprice.diff.1, data = trainset.stripped)

every.lm.df[1,2] <- anova(fit1)$'Pr(>F)'[1]
every.lm.df[2,2] <- anova(fit2)$'Pr(>F)'[1] 
every.lm.df[3,2] <- anova(fit3)$'Pr(>F)'[1] 
every.lm.df[4,2] <- anova(fit4)$'Pr(>F)'[1] 
every.lm.df[5,2] <- anova(fit5)$'Pr(>F)'[1] 
every.lm.df[6,2] <- anova(fit6)$'Pr(>F)'[1] 
every.lm.df[7,2] <- anova(fit7)$'Pr(>F)'[1] 
every.lm.df[8,2] <- anova(fit8)$'Pr(>F)'[1] 
every.lm.df[9,2] <- anova(fit9)$'Pr(>F)'[1] 
every.lm.df[10,2] <- anova(fit10)$'Pr(>F)'[1] 
every.lm.df[11,2] <- anova(fit11)$'Pr(>F)'[1] 
every.lm.df[12,2] <- anova(fit12)$'Pr(>F)'[1] 
every.lm.df[13,2] <- anova(fit13)$'Pr(>F)'[1] 
every.lm.df[14,2] <- anova(fit14)$'Pr(>F)'[1] 
every.lm.df[15,2] <- anova(fit15)$'Pr(>F)'[1] 
every.lm.df[16,2] <- anova(fit16)$'Pr(>F)'[1] 
every.lm.df[17,2] <- anova(fit17)$'Pr(>F)'[1] 
every.lm.df[18,2] <- anova(fit18)$'Pr(>F)'[1] 
every.lm.df[19,2] <- anova(fit19)$'Pr(>F)'[1]
every.lm.df[20,2] <- anova(fit20)$'Pr(>F)'[1] 
every.lm.df[21,2] <- anova(fit21)$'Pr(>F)'[1] 
every.lm.df[22,2] <- anova(fit22)$'Pr(>F)'[1]
every.lm.df[23,2] <- anova(fit23)$'Pr(>F)'[1] 
every.lm.df[24,2] <- anova(fit24)$'Pr(>F)'[1] 
every.lm.df[25,2] <- anova(fit25)$'Pr(>F)'[1] 
every.lm.df[26,2] <- anova(fit26)$'Pr(>F)'[1] 
every.lm.df[27,2] <- anova(fit27)$'Pr(>F)'[1] 
every.lm.df[28,2] <- anova(fit28)$'Pr(>F)'[1] 
every.lm.df[29,2] <- anova(fit29)$'Pr(>F)'[1] 
every.lm.df[30,2] <- anova(fit30)$'Pr(>F)'[1] 
every.lm.df[31,2] <- anova(fit31)$'Pr(>F)'[1] 
every.lm.df[32,2] <- anova(fit32)$'Pr(>F)'[1] 
every.lm.df[33,2] <- anova(fit33)$'Pr(>F)'[1] 
every.lm.df[34,2] <- anova(fit34)$'Pr(>F)'[1] 
every.lm.df[35,2] <- anova(fit35)$'Pr(>F)'[1] 
every.lm.df[36,2] <- anova(fit36)$'Pr(>F)'[1]

every.lm.df[1,3] <- summary(fit1)$r.squared
every.lm.df[2,3] <- summary(fit2)$r.squared 
every.lm.df[3,3] <- summary(fit3)$r.squared
every.lm.df[4,3] <- summary(fit4)$r.squared
every.lm.df[5,3] <- summary(fit5)$r.squared 
every.lm.df[6,3] <- summary(fit6)$r.squared
every.lm.df[7,3] <- summary(fit7)$r.squared 
every.lm.df[8,3] <- summary(fit8)$r.squared
every.lm.df[9,3] <- summary(fit9)$r.squared 
every.lm.df[10,3] <- summary(fit10)$r.squared 
every.lm.df[11,3] <- summary(fit11)$r.squared 
every.lm.df[12,3] <- summary(fit12)$r.squared 
every.lm.df[13,3] <- summary(fit13)$r.squared 
every.lm.df[14,3] <- summary(fit14)$r.squared 
every.lm.df[15,3] <- summary(fit15)$r.squared 
every.lm.df[16,3] <- summary(fit16)$r.squared 
every.lm.df[17,3] <- summary(fit17)$r.squared 
every.lm.df[18,3] <- summary(fit18)$r.squared 
every.lm.df[19,3] <- summary(fit19)$r.squared
every.lm.df[20,3] <- summary(fit20)$r.squared 
every.lm.df[21,3] <- summary(fit21)$r.squared 
every.lm.df[22,3] <- summary(fit22)$r.squared
every.lm.df[23,3] <- summary(fit23)$r.squared 
every.lm.df[24,3] <- summary(fit24)$r.squared 
every.lm.df[25,3] <- summary(fit25)$r.squared 
every.lm.df[26,3] <- summary(fit26)$r.squared 
every.lm.df[27,3] <- summary(fit27)$r.squared 
every.lm.df[28,3] <- summary(fit28)$r.squared 
every.lm.df[29,3] <- summary(fit29)$r.squared 
every.lm.df[30,3] <- summary(fit30)$r.squared 
every.lm.df[31,3] <- summary(fit31)$r.squared 
every.lm.df[32,3] <-summary(fit32)$r.squared 
every.lm.df[33,3] <- summary(fit33)$r.squared
every.lm.df[34,3] <- summary(fit34)$r.squared 
every.lm.df[35,3] <- summary(fit35)$r.squared 
every.lm.df[36,3] <- summary(fit36)$r.squared

every.lm.df

##                variable      pvalue     rsquared
## 1      adj.ticket.price 0.021415679 6.857330e-02
## 2                   Pop 0.997585660 1.229001e-07
## 3           m.per.f.100 0.308788477 1.380741e-02
## 4            Median.Age 0.248751455 1.769735e-02
## 5     adj.median.income 0.594349295 3.799423e-03
## 6            race.white 0.034999801 5.791858e-02
## 7            race.black 0.079271655 4.049459e-02
## 8            race.asian 0.899033501 2.160675e-04
## 9            race.latin 0.046091955 5.199573e-02
## 10      race.indigenous 0.272217101 1.605103e-02
## 11           race.other 0.932261346 9.697310e-05
## 12           race.mixed 0.547682974 4.840266e-03
## 13               i.agri 0.920576719 1.334316e-04
## 14             i.mining 0.431710654 8.263655e-03
## 15          i.utilities 0.009622175 8.604490e-02
## 16       i.construction 0.122715154 3.147033e-02
## 17      i.manufacturing 0.150681855 2.734084e-02
## 18          i.wholesale 0.392560711 9.763767e-03
## 19             i.retail 0.296353347 1.453096e-02
## 20     i.transportation 0.992649420 1.139224e-06
## 21       i.info.culture 0.797652386 8.819269e-04
## 22      i.finance.insur 0.381409197 1.023097e-02
## 23         i.realestate 0.200625595 2.174345e-02
## 24      i.sci.technical 0.631987973 3.074044e-03
## 25 i.company.management 0.963841819 2.758408e-05
## 26        i.admin.waste 0.638914245 2.950801e-03
## 27          i.education 0.874746587 3.334660e-04
## 28             i.health 0.291008542 1.485388e-02
## 29     i.arts.entertain 0.474049379 6.855693e-03
## 30       i.food.accommo 0.319584185 1.320887e-02
## 31              i.other 0.201109952 2.169724e-02
## 32          i.pub.admin 0.627171216 3.161577e-03
## 33        z.pct.att.pop 0.352958892 1.151395e-02
## 34      pct.tick.income 0.048927385 5.071755e-02
## 35          payroll.pct 0.953123133 4.638324e-05
## 36     tickprice.diff.1 0.729162430 1.607824e-03

As you see, none of the variables show any predictive power by themselves. Let’s see if we can fit a working model based on multiple variables:

full <- lm(cor.1by1.big~., data = trainset.stripped)

step <- stepAIC(full, direction = "both", trace = F)
step$anova

## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 + Median.Age + 
##     adj.median.income + race.white + race.black + race.asian + 
##     race.latin + race.indigenous + race.other + race.mixed + 
##     i.agri + i.mining + i.utilities + i.construction + i.manufacturing + 
##     i.wholesale + i.retail + i.transportation + i.info.culture + 
##     i.finance.insur + i.realestate + i.sci.technical + i.company.management + 
##     i.admin.waste + i.education + i.health + i.arts.entertain + 
##     i.food.accommo + i.other + i.pub.admin + z.pct.att.pop + 
##     pct.tick.income + payroll.pct + tickprice.diff.1
## 
## Final Model:
## cor.1by1.big ~ adj.ticket.price + adj.median.income + race.white + 
##     race.latin + race.indigenous + race.other + race.mixed + 
##     i.agri + i.utilities + i.construction + i.wholesale + i.transportation + 
##     i.realestate + i.company.management + i.health + i.arts.entertain + 
##     z.pct.att.pop + payroll.pct + Pop
## 
## 
##                  Step Df     Deviance Resid. Df Resid. Dev       AIC
## 1                                            41   1.797100 -217.3376
## 2       - i.pub.admin  0 0.000000e+00        41   1.797100 -217.3376
## 3    - i.food.accommo  1 6.840518e-05        42   1.797169 -219.3347
## 4          - i.retail  1 6.662072e-05        43   1.797235 -221.3318
## 5        - race.asian  1 1.558693e-03        44   1.798794 -223.2651
## 6   - i.sci.technical  1 2.184344e-03        45   1.800978 -225.1716
## 7           - i.other  1 5.315784e-03        46   1.806294 -226.9447
## 8       - m.per.f.100  1 6.320710e-03        47   1.812615 -228.6757
## 9   - pct.tick.income  1 5.580834e-03        48   1.818196 -230.4390
## 10   - i.info.culture  1 5.490905e-03        49   1.823686 -232.2068
## 11  - i.finance.insur  1 1.152619e-02        50   1.835213 -233.7217
## 12  - i.manufacturing  1 3.501397e-03        51   1.838714 -235.5749
## 13 - tickprice.diff.1  1 1.614673e-02        52   1.854861 -236.9017
## 14      - i.education  1 2.433345e-02        53   1.879194 -237.8981
## 15       - race.black  1 1.601560e-02        54   1.895210 -239.2446
## 16         - i.mining  1 3.366936e-02        55   1.928879 -239.8887
## 17              - Pop  1 2.477971e-02        56   1.953659 -240.9058
## 18       - Median.Age  1 3.330656e-02        57   1.986965 -241.6042
## 19    - i.admin.waste  1 3.841679e-02        58   2.025382 -242.1296
## 20              + Pop  1 5.750113e-02        57   1.967881 -242.3473

summary(step)

## 
## Call:
## lm(formula = cor.1by1.big ~ adj.ticket.price + adj.median.income + 
##     race.white + race.latin + race.indigenous + race.other + 
##     race.mixed + i.agri + i.utilities + i.construction + i.wholesale + 
##     i.transportation + i.realestate + i.company.management + 
##     i.health + i.arts.entertain + z.pct.att.pop + payroll.pct + 
##     Pop, data = trainset.stripped)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.30511 -0.10187  0.00887  0.13254  0.34517 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           6.496e-01  6.499e-01   0.999 0.321782    
## adj.ticket.price     -6.597e-03  1.676e-03  -3.935 0.000228 ***
## adj.median.income    -1.449e-05  4.785e-06  -3.029 0.003687 ** 
## race.white            7.505e-03  3.171e-03   2.366 0.021384 *  
## race.latin           -9.381e-03  3.168e-03  -2.961 0.004469 ** 
## race.indigenous      -3.703e-01  8.402e-02  -4.407 4.69e-05 ***
## race.other            5.126e-02  9.362e-03   5.476 1.02e-06 ***
## race.mixed            5.327e-02  4.076e-02   1.307 0.196486    
## i.agri                5.018e-01  2.894e-01   1.734 0.088299 .  
## i.utilities           7.344e-01  1.679e-01   4.374 5.25e-05 ***
## i.construction       -9.079e-02  2.483e-02  -3.657 0.000559 ***
## i.wholesale          -1.758e-01  7.990e-02  -2.200 0.031873 *  
## i.transportation      6.390e-02  2.804e-02   2.279 0.026421 *  
## i.realestate          2.569e-01  9.767e-02   2.630 0.010952 *  
## i.company.management  1.661e+00  6.002e-01   2.767 0.007617 ** 
## i.health             -2.924e-02  1.804e-02  -1.621 0.110578    
## i.arts.entertain     -1.105e-01  4.492e-02  -2.460 0.016929 *  
## z.pct.att.pop         5.615e-02  3.635e-02   1.545 0.127907    
## payroll.pct           4.149e-03  1.222e-03   3.396 0.001252 ** 
## Pop                  -2.802e-08  2.171e-08  -1.291 0.202069    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1858 on 57 degrees of freedom
## Multiple R-squared:  0.5572, Adjusted R-squared:  0.4096 
## F-statistic: 3.775 on 19 and 57 DF,  p-value: 5.178e-05

step.for <- stepAIC(full, direction = "forward", trace = F)
step.for$anova

## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 + Median.Age + 
##     adj.median.income + race.white + race.black + race.asian + 
##     race.latin + race.indigenous + race.other + race.mixed + 
##     i.agri + i.mining + i.utilities + i.construction + i.manufacturing + 
##     i.wholesale + i.retail + i.transportation + i.info.culture + 
##     i.finance.insur + i.realestate + i.sci.technical + i.company.management + 
##     i.admin.waste + i.education + i.health + i.arts.entertain + 
##     i.food.accommo + i.other + i.pub.admin + z.pct.att.pop + 
##     pct.tick.income + payroll.pct + tickprice.diff.1
## 
## Final Model:
## cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 + Median.Age + 
##     adj.median.income + race.white + race.black + race.asian + 
##     race.latin + race.indigenous + race.other + race.mixed + 
##     i.agri + i.mining + i.utilities + i.construction + i.manufacturing + 
##     i.wholesale + i.retail + i.transportation + i.info.culture + 
##     i.finance.insur + i.realestate + i.sci.technical + i.company.management + 
##     i.admin.waste + i.education + i.health + i.arts.entertain + 
##     i.food.accommo + i.other + i.pub.admin + z.pct.att.pop + 
##     pct.tick.income + payroll.pct + tickprice.diff.1
## 
## 
##   Step Df Deviance Resid. Df Resid. Dev       AIC
## 1                         41     1.7971 -217.3376

summary(step.for)

## 
## Call:
## lm(formula = cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 + 
##     Median.Age + adj.median.income + race.white + race.black + 
##     race.asian + race.latin + race.indigenous + race.other + 
##     race.mixed + i.agri + i.mining + i.utilities + i.construction + 
##     i.manufacturing + i.wholesale + i.retail + i.transportation + 
##     i.info.culture + i.finance.insur + i.realestate + i.sci.technical + 
##     i.company.management + i.admin.waste + i.education + i.health + 
##     i.arts.entertain + i.food.accommo + i.other + i.pub.admin + 
##     z.pct.att.pop + pct.tick.income + payroll.pct + tickprice.diff.1, 
##     data = trainset.stripped)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.38061 -0.12110 -0.00275  0.12472  0.32482 
## 
## Coefficients: (1 not defined because of singularities)
##                        Estimate Std. Error t value Pr(>|t|)  
## (Intercept)          -4.452e+00  1.290e+01  -0.345   0.7318  
## adj.ticket.price     -8.559e-03  6.098e-03  -1.403   0.1680  
## Pop                  -4.209e-08  6.137e-08  -0.686   0.4967  
## m.per.f.100          -8.911e-03  3.551e-02  -0.251   0.8031  
## Median.Age            4.028e-02  7.441e-02   0.541   0.5912  
## adj.median.income    -1.404e-05  1.462e-05  -0.960   0.3425  
## race.white            4.370e-02  1.348e-01   0.324   0.7475  
## race.black            3.483e-02  1.328e-01   0.262   0.7945  
## race.asian            2.738e-02  1.403e-01   0.195   0.8462  
## race.latin           -1.023e-02  8.699e-03  -1.176   0.2462  
## race.indigenous      -3.819e-01  1.929e-01  -1.980   0.0545 .
## race.other            8.903e-02  1.186e-01   0.751   0.4571  
## race.mixed            1.332e-01  1.384e-01   0.963   0.3413  
## i.agri                6.868e-01  4.665e-01   1.472   0.1486  
## i.mining              9.595e-02  1.146e-01   0.837   0.4074  
## i.utilities           6.647e-01  3.534e-01   1.881   0.0671 .
## i.construction       -8.453e-02  6.511e-02  -1.298   0.2015  
## i.manufacturing       2.683e-02  4.173e-02   0.643   0.5238  
## i.wholesale          -2.604e-01  1.469e-01  -1.772   0.0838 .
## i.retail             -3.072e-03  7.626e-02  -0.040   0.9681  
## i.transportation      9.827e-02  6.857e-02   1.433   0.1594  
## i.info.culture        4.280e-02  8.757e-02   0.489   0.6276  
## i.finance.insur       2.713e-02  4.645e-02   0.584   0.5624  
## i.realestate          3.086e-01  2.743e-01   1.125   0.2672  
## i.sci.technical       1.026e-02  4.637e-02   0.221   0.8260  
## i.company.management  1.656e+00  1.282e+00   1.292   0.2036  
## i.admin.waste         5.166e-02  1.047e-01   0.493   0.6244  
## i.education           2.439e-02  5.278e-02   0.462   0.6464  
## i.health             -3.396e-02  3.275e-02  -1.037   0.3058  
## i.arts.entertain     -1.147e-01  1.107e-01  -1.036   0.3062  
## i.food.accommo       -2.456e-03  6.217e-02  -0.040   0.9687  
## i.other              -2.061e-02  1.045e-01  -0.197   0.8446  
## i.pub.admin                  NA         NA      NA       NA  
## z.pct.att.pop         4.123e-02  8.268e-02   0.499   0.6207  
## pct.tick.income       1.192e+00  3.158e+00   0.377   0.7079  
## payroll.pct           4.102e-03  1.650e-03   2.486   0.0171 *
## tickprice.diff.1      4.702e-03  8.353e-03   0.563   0.5766  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2094 on 41 degrees of freedom
## Multiple R-squared:  0.5956, Adjusted R-squared:  0.2505 
## F-statistic: 1.726 on 35 and 41 DF,  p-value: 0.0469

step.back<- stepAIC(full, direction = "backward", trace = F)
step.back$anova

## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 + Median.Age + 
##     adj.median.income + race.white + race.black + race.asian + 
##     race.latin + race.indigenous + race.other + race.mixed + 
##     i.agri + i.mining + i.utilities + i.construction + i.manufacturing + 
##     i.wholesale + i.retail + i.transportation + i.info.culture + 
##     i.finance.insur + i.realestate + i.sci.technical + i.company.management + 
##     i.admin.waste + i.education + i.health + i.arts.entertain + 
##     i.food.accommo + i.other + i.pub.admin + z.pct.att.pop + 
##     pct.tick.income + payroll.pct + tickprice.diff.1
## 
## Final Model:
## cor.1by1.big ~ adj.ticket.price + adj.median.income + race.white + 
##     race.latin + race.indigenous + race.other + race.mixed + 
##     i.agri + i.utilities + i.construction + i.wholesale + i.transportation + 
##     i.realestate + i.company.management + i.health + i.arts.entertain + 
##     z.pct.att.pop + payroll.pct
## 
## 
##                  Step Df     Deviance Resid. Df Resid. Dev       AIC
## 1                                            41   1.797100 -217.3376
## 2       - i.pub.admin  0 0.000000e+00        41   1.797100 -217.3376
## 3    - i.food.accommo  1 6.840518e-05        42   1.797169 -219.3347
## 4          - i.retail  1 6.662072e-05        43   1.797235 -221.3318
## 5        - race.asian  1 1.558693e-03        44   1.798794 -223.2651
## 6   - i.sci.technical  1 2.184344e-03        45   1.800978 -225.1716
## 7           - i.other  1 5.315784e-03        46   1.806294 -226.9447
## 8       - m.per.f.100  1 6.320710e-03        47   1.812615 -228.6757
## 9   - pct.tick.income  1 5.580834e-03        48   1.818196 -230.4390
## 10   - i.info.culture  1 5.490905e-03        49   1.823686 -232.2068
## 11  - i.finance.insur  1 1.152619e-02        50   1.835213 -233.7217
## 12  - i.manufacturing  1 3.501397e-03        51   1.838714 -235.5749
## 13 - tickprice.diff.1  1 1.614673e-02        52   1.854861 -236.9017
## 14      - i.education  1 2.433345e-02        53   1.879194 -237.8981
## 15       - race.black  1 1.601560e-02        54   1.895210 -239.2446
## 16         - i.mining  1 3.366936e-02        55   1.928879 -239.8887
## 17              - Pop  1 2.477971e-02        56   1.953659 -240.9058
## 18       - Median.Age  1 3.330656e-02        57   1.986965 -241.6042
## 19    - i.admin.waste  1 3.841679e-02        58   2.025382 -242.1296

summary(step.back)

## 
## Call:
## lm(formula = cor.1by1.big ~ adj.ticket.price + adj.median.income + 
##     race.white + race.latin + race.indigenous + race.other + 
##     race.mixed + i.agri + i.utilities + i.construction + i.wholesale + 
##     i.transportation + i.realestate + i.company.management + 
##     i.health + i.arts.entertain + z.pct.att.pop + payroll.pct, 
##     data = trainset.stripped)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.34496 -0.10418 -0.01453  0.11595  0.39372 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           1.065e+00  5.678e-01   1.876 0.065750 .  
## adj.ticket.price     -6.738e-03  1.682e-03  -4.005 0.000179 ***
## adj.median.income    -1.582e-05  4.700e-06  -3.366 0.001357 ** 
## race.white            7.526e-03  3.189e-03   2.360 0.021683 *  
## race.latin           -9.565e-03  3.183e-03  -3.005 0.003922 ** 
## race.indigenous      -3.645e-01  8.438e-02  -4.320 6.18e-05 ***
## race.other            4.531e-02  8.192e-03   5.531 7.98e-07 ***
## race.mixed            6.981e-02  3.892e-02   1.794 0.078054 .  
## i.agri                3.958e-01  2.790e-01   1.418 0.161465    
## i.utilities           7.299e-01  1.688e-01   4.323 6.12e-05 ***
## i.construction       -8.002e-02  2.352e-02  -3.402 0.001216 ** 
## i.wholesale          -1.740e-01  8.034e-02  -2.166 0.034443 *  
## i.transportation      5.256e-02  2.678e-02   1.963 0.054458 .  
## i.realestate          1.777e-01  7.640e-02   2.326 0.023564 *  
## i.company.management  1.480e+00  5.869e-01   2.521 0.014466 *  
## i.health             -4.164e-02  1.535e-02  -2.713 0.008773 ** 
## i.arts.entertain     -1.073e-01  4.511e-02  -2.379 0.020670 *  
## z.pct.att.pop         5.918e-02  3.648e-02   1.623 0.110115    
## payroll.pct           4.114e-03  1.228e-03   3.349 0.001429 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1869 on 58 degrees of freedom
## Multiple R-squared:  0.5443, Adjusted R-squared:  0.4029 
## F-statistic: 3.848 on 18 and 58 DF,  p-value: 4.789e-05

While it looks like the backwards and stepwise regression models show some significance, I still have my concerns. The forward selection kept all the variables so I will not try using that model. Let’s test our backward and stepwise regression models on my test set.

performance <- as.data.frame(test_cor)

performance$step.back <- predict(step.back, testset)
performance$step <- predict(step, testset)

performance <- performance[order(performance$test_cor),]

plot(performance$test_cor)
lines(performance$step.back,type='l',col='blue')
lines(performance$step,type='l',col='red')

Yikes! That does not look like a well predicting model. Looks like it has overfit to the training set. I will try a new training set to test:

set.seed(66)

train_index <- createDataPartition(coreset$cor.1by1.big, times = 1, p = 0.7, list = F)
trainset <- coreset[train_index,]
testset <- coreset[-train_index,]

train_cor <- trainset$cor.1by1.big
test_cor <- testset$cor.1by1.big
testset<-testset[,-1]

cor.test <- cor(trainset, use = "na.or.complete")
cor.test.melt <- arrange(melt(cor.test), -abs(value))

trainset.stripped <- trainset[,-c(6,7,36:38,42)]

every.lm.df <- data.frame(colnames(trainset.stripped[,-1]))
every.lm.df$pvalue <- NA
every.lm.df$rsquared <- NA
names(every.lm.df)[1] <- "variable"

###tried to build a function but did not work
fit1 <- lm(cor.1by1.big~adj.ticket.price, data = trainset.stripped)
fit2 <- lm(cor.1by1.big~Pop, data = trainset.stripped)
fit3 <- lm(cor.1by1.big~m.per.f.100, data = trainset.stripped)
fit4 <- lm(cor.1by1.big~Median.Age, data = trainset.stripped)
fit5 <- lm(cor.1by1.big~adj.median.income, data = trainset.stripped)
fit6 <- lm(cor.1by1.big~race.white, data = trainset.stripped)
fit7 <- lm(cor.1by1.big~race.black, data = trainset.stripped)
fit8 <- lm(cor.1by1.big~race.asian, data = trainset.stripped)
fit9 <- lm(cor.1by1.big~race.latin, data = trainset.stripped)
fit10 <- lm(cor.1by1.big~race.indigenous, data = trainset.stripped)
fit11 <- lm(cor.1by1.big~race.other, data = trainset.stripped)
fit12 <- lm(cor.1by1.big~race.mixed, data = trainset.stripped)
fit13 <- lm(cor.1by1.big~i.agri, data = trainset.stripped)
fit14 <- lm(cor.1by1.big~i.mining, data = trainset.stripped)
fit15 <- lm(cor.1by1.big~i.utilities, data = trainset.stripped)
fit16 <- lm(cor.1by1.big~i.construction, data = trainset.stripped)
fit17 <- lm(cor.1by1.big~i.manufacturing, data = trainset.stripped)
fit18 <- lm(cor.1by1.big~i.wholesale, data = trainset.stripped)
fit19 <- lm(cor.1by1.big~i.retail, data = trainset.stripped)
fit20 <- lm(cor.1by1.big~i.transportation, data = trainset.stripped)
fit21 <- lm(cor.1by1.big~i.info.culture, data = trainset.stripped)
fit22 <- lm(cor.1by1.big~i.finance.insur, data = trainset.stripped)
fit23 <- lm(cor.1by1.big~i.realestate, data = trainset.stripped)
fit24 <- lm(cor.1by1.big~i.sci.technical, data = trainset.stripped)
fit25 <- lm(cor.1by1.big~i.company.management, data = trainset.stripped)
fit26 <- lm(cor.1by1.big~i.admin.waste, data = trainset.stripped)
fit27 <- lm(cor.1by1.big~i.education, data = trainset.stripped)
fit28 <- lm(cor.1by1.big~i.health, data = trainset.stripped)
fit29 <- lm(cor.1by1.big~i.arts.entertain, data = trainset.stripped)
fit30 <- lm(cor.1by1.big~i.food.accommo, data = trainset.stripped)
fit31 <- lm(cor.1by1.big~i.other, data = trainset.stripped)
fit32 <- lm(cor.1by1.big~i.pub.admin, data = trainset.stripped)
fit33 <- lm(cor.1by1.big~z.pct.att.pop, data = trainset.stripped)
fit34 <- lm(cor.1by1.big~pct.tick.income, data = trainset.stripped)
fit35 <- lm(cor.1by1.big~payroll.pct, data = trainset.stripped)
fit36 <- lm(cor.1by1.big~tickprice.diff.1, data = trainset.stripped)

every.lm.df[1,2] <- anova(fit1)$'Pr(>F)'[1]
every.lm.df[2,2] <- anova(fit2)$'Pr(>F)'[1] 
every.lm.df[3,2] <- anova(fit3)$'Pr(>F)'[1] 
every.lm.df[4,2] <- anova(fit4)$'Pr(>F)'[1] 
every.lm.df[5,2] <- anova(fit5)$'Pr(>F)'[1] 
every.lm.df[6,2] <- anova(fit6)$'Pr(>F)'[1] 
every.lm.df[7,2] <- anova(fit7)$'Pr(>F)'[1] 
every.lm.df[8,2] <- anova(fit8)$'Pr(>F)'[1] 
every.lm.df[9,2] <- anova(fit9)$'Pr(>F)'[1] 
every.lm.df[10,2] <- anova(fit10)$'Pr(>F)'[1] 
every.lm.df[11,2] <- anova(fit11)$'Pr(>F)'[1] 
every.lm.df[12,2] <- anova(fit12)$'Pr(>F)'[1] 
every.lm.df[13,2] <- anova(fit13)$'Pr(>F)'[1] 
every.lm.df[14,2] <- anova(fit14)$'Pr(>F)'[1] 
every.lm.df[15,2] <- anova(fit15)$'Pr(>F)'[1] 
every.lm.df[16,2] <- anova(fit16)$'Pr(>F)'[1] 
every.lm.df[17,2] <- anova(fit17)$'Pr(>F)'[1] 
every.lm.df[18,2] <- anova(fit18)$'Pr(>F)'[1] 
every.lm.df[19,2] <- anova(fit19)$'Pr(>F)'[1]
every.lm.df[20,2] <- anova(fit20)$'Pr(>F)'[1] 
every.lm.df[21,2] <- anova(fit21)$'Pr(>F)'[1] 
every.lm.df[22,2] <- anova(fit22)$'Pr(>F)'[1]
every.lm.df[23,2] <- anova(fit23)$'Pr(>F)'[1] 
every.lm.df[24,2] <- anova(fit24)$'Pr(>F)'[1] 
every.lm.df[25,2] <- anova(fit25)$'Pr(>F)'[1] 
every.lm.df[26,2] <- anova(fit26)$'Pr(>F)'[1] 
every.lm.df[27,2] <- anova(fit27)$'Pr(>F)'[1] 
every.lm.df[28,2] <- anova(fit28)$'Pr(>F)'[1] 
every.lm.df[29,2] <- anova(fit29)$'Pr(>F)'[1] 
every.lm.df[30,2] <- anova(fit30)$'Pr(>F)'[1] 
every.lm.df[31,2] <- anova(fit31)$'Pr(>F)'[1] 
every.lm.df[32,2] <- anova(fit32)$'Pr(>F)'[1] 
every.lm.df[33,2] <- anova(fit33)$'Pr(>F)'[1] 
every.lm.df[34,2] <- anova(fit34)$'Pr(>F)'[1] 
every.lm.df[35,2] <- anova(fit35)$'Pr(>F)'[1] 
every.lm.df[36,2] <- anova(fit36)$'Pr(>F)'[1]

every.lm.df[1,3] <- summary(fit1)$r.squared
every.lm.df[2,3] <- summary(fit2)$r.squared 
every.lm.df[3,3] <- summary(fit3)$r.squared
every.lm.df[4,3] <- summary(fit4)$r.squared
every.lm.df[5,3] <- summary(fit5)$r.squared 
every.lm.df[6,3] <- summary(fit6)$r.squared
every.lm.df[7,3] <- summary(fit7)$r.squared 
every.lm.df[8,3] <- summary(fit8)$r.squared
every.lm.df[9,3] <- summary(fit9)$r.squared 
every.lm.df[10,3] <- summary(fit10)$r.squared 
every.lm.df[11,3] <- summary(fit11)$r.squared 
every.lm.df[12,3] <- summary(fit12)$r.squared 
every.lm.df[13,3] <- summary(fit13)$r.squared 
every.lm.df[14,3] <- summary(fit14)$r.squared 
every.lm.df[15,3] <- summary(fit15)$r.squared 
every.lm.df[16,3] <- summary(fit16)$r.squared 
every.lm.df[17,3] <- summary(fit17)$r.squared 
every.lm.df[18,3] <- summary(fit18)$r.squared 
every.lm.df[19,3] <- summary(fit19)$r.squared
every.lm.df[20,3] <- summary(fit20)$r.squared 
every.lm.df[21,3] <- summary(fit21)$r.squared 
every.lm.df[22,3] <- summary(fit22)$r.squared
every.lm.df[23,3] <- summary(fit23)$r.squared 
every.lm.df[24,3] <- summary(fit24)$r.squared 
every.lm.df[25,3] <- summary(fit25)$r.squared 
every.lm.df[26,3] <- summary(fit26)$r.squared 
every.lm.df[27,3] <- summary(fit27)$r.squared 
every.lm.df[28,3] <- summary(fit28)$r.squared 
every.lm.df[29,3] <- summary(fit29)$r.squared 
every.lm.df[30,3] <- summary(fit30)$r.squared 
every.lm.df[31,3] <- summary(fit31)$r.squared 
every.lm.df[32,3] <-summary(fit32)$r.squared 
every.lm.df[33,3] <- summary(fit33)$r.squared
every.lm.df[34,3] <- summary(fit34)$r.squared 
every.lm.df[35,3] <- summary(fit35)$r.squared 
every.lm.df[36,3] <- summary(fit36)$r.squared

full <- lm(cor.1by1.big~., data = trainset.stripped)

step <- stepAIC(full, direction = "both", trace = F)
step$anova

## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 + Median.Age + 
##     adj.median.income + race.white + race.black + race.asian + 
##     race.latin + race.indigenous + race.other + race.mixed + 
##     i.agri + i.mining + i.utilities + i.construction + i.manufacturing + 
##     i.wholesale + i.retail + i.transportation + i.info.culture + 
##     i.finance.insur + i.realestate + i.sci.technical + i.company.management + 
##     i.admin.waste + i.education + i.health + i.arts.entertain + 
##     i.food.accommo + i.other + i.pub.admin + z.pct.att.pop + 
##     pct.tick.income + payroll.pct + tickprice.diff.1
## 
## Final Model:
## cor.1by1.big ~ Pop + race.white + race.black + race.asian + race.indigenous + 
##     race.other + i.transportation + i.admin.waste + i.education + 
##     i.other + pct.tick.income + payroll.pct
## 
## 
##                      Step Df     Deviance Resid. Df Resid. Dev       AIC
## 1                                                41   2.142817 -203.7897
## 2           - i.pub.admin  0 0.000000e+00        41   2.142817 -203.7897
## 3            - race.mixed  1 8.698656e-05        42   2.142904 -205.7866
## 4      - adj.ticket.price  1 4.839874e-04        43   2.143388 -207.7692
## 5           - i.utilities  1 1.319939e-03        44   2.144708 -209.7218
## 6       - i.finance.insur  1 2.513822e-03        45   2.147221 -211.6316
## 7        - i.info.culture  1 4.937379e-03        46   2.152159 -213.4547
## 8          - i.realestate  1 1.324748e-02        47   2.165406 -214.9822
## 9              - i.health  1 2.011350e-02        48   2.185520 -216.2703
## 10       - i.food.accommo  1 1.308001e-02        49   2.198600 -217.8108
## 11           - race.latin  1 1.419612e-02        50   2.212796 -219.3152
## 12             - i.mining  1 2.950924e-02        51   2.242305 -220.2952
## 13     - tickprice.diff.1  1 2.782312e-02        52   2.270128 -221.3456
## 14        - z.pct.att.pop  1 4.543336e-02        53   2.315562 -221.8198
## 15           - Median.Age  1 4.422811e-02        54   2.359790 -222.3629
## 16 - i.company.management  1 1.699285e-02        55   2.376782 -223.8104
## 17    - adj.median.income  1 4.901062e-02        56   2.425793 -224.2388
## 18             - i.retail  1 3.501037e-02        57   2.460803 -225.1354
## 19      - i.manufacturing  1 4.638659e-02        58   2.507190 -225.6975
## 20     - i.arts.entertain  1 1.861312e-02        59   2.525803 -227.1280
## 21       - i.construction  1 1.590265e-02        60   2.541706 -228.6447
## 22      - i.sci.technical  1 1.631322e-02        61   2.558019 -230.1521
## 23          - m.per.f.100  1 3.405628e-02        62   2.592075 -231.1337
## 24               - i.agri  1 4.710081e-02        63   2.639176 -231.7471
## 25          - i.wholesale  1 3.196753e-02        64   2.671144 -232.8200

summary(step)

## 
## Call:
## lm(formula = cor.1by1.big ~ Pop + race.white + race.black + race.asian + 
##     race.indigenous + race.other + i.transportation + i.admin.waste + 
##     i.education + i.other + pct.tick.income + payroll.pct, data = trainset.stripped)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.49401 -0.13496  0.01773  0.13799  0.38065 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1.648e+01  4.460e+00   3.694 0.000459 ***
## Pop              -7.575e-08  2.035e-08  -3.722 0.000419 ***
## race.white       -1.575e-01  4.398e-02  -3.582 0.000659 ***
## race.black       -1.424e-01  4.258e-02  -3.344 0.001384 ** 
## race.asian       -1.757e-01  4.827e-02  -3.639 0.000548 ***
## race.indigenous  -1.175e-01  7.672e-02  -1.532 0.130418    
## race.other       -1.266e-01  4.345e-02  -2.914 0.004909 ** 
## i.transportation  4.262e-02  2.465e-02   1.729 0.088644 .  
## i.admin.waste    -1.370e-01  4.919e-02  -2.784 0.007046 ** 
## i.education      -3.814e-02  1.960e-02  -1.946 0.056077 .  
## i.other          -7.303e-02  2.783e-02  -2.624 0.010852 *  
## pct.tick.income  -1.878e+00  7.188e-01  -2.612 0.011189 *  
## payroll.pct       1.806e-03  1.204e-03   1.500 0.138408    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2043 on 64 degrees of freedom
## Multiple R-squared:  0.4353, Adjusted R-squared:  0.3294 
## F-statistic: 4.111 on 12 and 64 DF,  p-value: 9.588e-05

step.for <- stepAIC(full, direction = "forward", trace = F)
step.for$anova

## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 + Median.Age + 
##     adj.median.income + race.white + race.black + race.asian + 
##     race.latin + race.indigenous + race.other + race.mixed + 
##     i.agri + i.mining + i.utilities + i.construction + i.manufacturing + 
##     i.wholesale + i.retail + i.transportation + i.info.culture + 
##     i.finance.insur + i.realestate + i.sci.technical + i.company.management + 
##     i.admin.waste + i.education + i.health + i.arts.entertain + 
##     i.food.accommo + i.other + i.pub.admin + z.pct.att.pop + 
##     pct.tick.income + payroll.pct + tickprice.diff.1
## 
## Final Model:
## cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 + Median.Age + 
##     adj.median.income + race.white + race.black + race.asian + 
##     race.latin + race.indigenous + race.other + race.mixed + 
##     i.agri + i.mining + i.utilities + i.construction + i.manufacturing + 
##     i.wholesale + i.retail + i.transportation + i.info.culture + 
##     i.finance.insur + i.realestate + i.sci.technical + i.company.management + 
##     i.admin.waste + i.education + i.health + i.arts.entertain + 
##     i.food.accommo + i.other + i.pub.admin + z.pct.att.pop + 
##     pct.tick.income + payroll.pct + tickprice.diff.1
## 
## 
##   Step Df Deviance Resid. Df Resid. Dev       AIC
## 1                         41   2.142817 -203.7897

summary(step.for)

## 
## Call:
## lm(formula = cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 + 
##     Median.Age + adj.median.income + race.white + race.black + 
##     race.asian + race.latin + race.indigenous + race.other + 
##     race.mixed + i.agri + i.mining + i.utilities + i.construction + 
##     i.manufacturing + i.wholesale + i.retail + i.transportation + 
##     i.info.culture + i.finance.insur + i.realestate + i.sci.technical + 
##     i.company.management + i.admin.waste + i.education + i.health + 
##     i.arts.entertain + i.food.accommo + i.other + i.pub.admin + 
##     z.pct.att.pop + pct.tick.income + payroll.pct + tickprice.diff.1, 
##     data = trainset.stripped)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.44126 -0.11706  0.01704  0.10725  0.38607 
## 
## Coefficients: (1 not defined because of singularities)
##                        Estimate Std. Error t value Pr(>|t|)  
## (Intercept)           1.669e+01  1.304e+01   1.279   0.2079  
## adj.ticket.price      9.994e-04  9.691e-03   0.103   0.9184  
## Pop                  -1.530e-07  7.441e-08  -2.056   0.0462 *
## m.per.f.100           2.075e-02  4.408e-02   0.471   0.6403  
## Median.Age            4.790e-02  7.280e-02   0.658   0.5142  
## adj.median.income     1.703e-05  2.201e-05   0.774   0.4436  
## race.white           -1.815e-01  1.355e-01  -1.339   0.1878  
## race.black           -1.575e-01  1.331e-01  -1.183   0.2436  
## race.asian           -2.289e-01  1.501e-01  -1.525   0.1349  
## race.latin            1.010e-02  1.237e-02   0.817   0.4189  
## race.indigenous      -3.275e-01  2.903e-01  -1.128   0.2658  
## race.other           -1.197e-01  1.249e-01  -0.958   0.3436  
## race.mixed            6.303e-03  1.545e-01   0.041   0.9677  
## i.agri                4.503e-01  7.793e-01   0.578   0.5665  
## i.mining              1.522e-01  2.142e-01   0.710   0.4815  
## i.utilities          -7.970e-02  6.682e-01  -0.119   0.9056  
## i.construction       -1.200e-01  9.228e-02  -1.300   0.2008  
## i.manufacturing      -3.453e-02  6.010e-02  -0.575   0.5688  
## i.wholesale          -3.711e-01  2.074e-01  -1.790   0.0809 .
## i.retail              1.264e-01  8.144e-02   1.552   0.1282  
## i.transportation      5.291e-02  6.984e-02   0.758   0.4530  
## i.info.culture        2.780e-02  9.274e-02   0.300   0.7659  
## i.finance.insur       1.365e-02  5.557e-02   0.246   0.8072  
## i.realestate          1.443e-01  3.097e-01   0.466   0.6438  
## i.sci.technical      -8.981e-02  5.961e-02  -1.507   0.1396  
## i.company.management  1.339e+00  1.462e+00   0.916   0.3651  
## i.admin.waste        -2.614e-01  1.389e-01  -1.882   0.0669 .
## i.education          -6.396e-02  7.001e-02  -0.914   0.3663  
## i.health             -2.188e-02  4.724e-02  -0.463   0.6457  
## i.arts.entertain     -1.158e-01  1.489e-01  -0.777   0.4414  
## i.food.accommo       -4.025e-02  6.610e-02  -0.609   0.5459  
## i.other              -1.253e-01  1.207e-01  -1.038   0.3052  
## i.pub.admin                  NA         NA      NA       NA  
## z.pct.att.pop         3.995e-02  6.741e-02   0.593   0.5567  
## pct.tick.income      -2.874e+00  4.918e+00  -0.584   0.5622  
## payroll.pct           2.323e-03  1.808e-03   1.285   0.2060  
## tickprice.diff.1      6.687e-03  7.755e-03   0.862   0.3936  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2286 on 41 degrees of freedom
## Multiple R-squared:  0.547,  Adjusted R-squared:  0.1603 
## F-statistic: 1.414 on 35 and 41 DF,  p-value: 0.1425

step.back<- stepAIC(full, direction = "backward", trace = F)
step.back$anova

## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 + Median.Age + 
##     adj.median.income + race.white + race.black + race.asian + 
##     race.latin + race.indigenous + race.other + race.mixed + 
##     i.agri + i.mining + i.utilities + i.construction + i.manufacturing + 
##     i.wholesale + i.retail + i.transportation + i.info.culture + 
##     i.finance.insur + i.realestate + i.sci.technical + i.company.management + 
##     i.admin.waste + i.education + i.health + i.arts.entertain + 
##     i.food.accommo + i.other + i.pub.admin + z.pct.att.pop + 
##     pct.tick.income + payroll.pct + tickprice.diff.1
## 
## Final Model:
## cor.1by1.big ~ Pop + race.white + race.black + race.asian + race.indigenous + 
##     race.other + i.transportation + i.admin.waste + i.education + 
##     i.other + pct.tick.income + payroll.pct
## 
## 
##                      Step Df     Deviance Resid. Df Resid. Dev       AIC
## 1                                                41   2.142817 -203.7897
## 2           - i.pub.admin  0 0.000000e+00        41   2.142817 -203.7897
## 3            - race.mixed  1 8.698656e-05        42   2.142904 -205.7866
## 4      - adj.ticket.price  1 4.839874e-04        43   2.143388 -207.7692
## 5           - i.utilities  1 1.319939e-03        44   2.144708 -209.7218
## 6       - i.finance.insur  1 2.513822e-03        45   2.147221 -211.6316
## 7        - i.info.culture  1 4.937379e-03        46   2.152159 -213.4547
## 8          - i.realestate  1 1.324748e-02        47   2.165406 -214.9822
## 9              - i.health  1 2.011350e-02        48   2.185520 -216.2703
## 10       - i.food.accommo  1 1.308001e-02        49   2.198600 -217.8108
## 11           - race.latin  1 1.419612e-02        50   2.212796 -219.3152
## 12             - i.mining  1 2.950924e-02        51   2.242305 -220.2952
## 13     - tickprice.diff.1  1 2.782312e-02        52   2.270128 -221.3456
## 14        - z.pct.att.pop  1 4.543336e-02        53   2.315562 -221.8198
## 15           - Median.Age  1 4.422811e-02        54   2.359790 -222.3629
## 16 - i.company.management  1 1.699285e-02        55   2.376782 -223.8104
## 17    - adj.median.income  1 4.901062e-02        56   2.425793 -224.2388
## 18             - i.retail  1 3.501037e-02        57   2.460803 -225.1354
## 19      - i.manufacturing  1 4.638659e-02        58   2.507190 -225.6975
## 20     - i.arts.entertain  1 1.861312e-02        59   2.525803 -227.1280
## 21       - i.construction  1 1.590265e-02        60   2.541706 -228.6447
## 22      - i.sci.technical  1 1.631322e-02        61   2.558019 -230.1521
## 23          - m.per.f.100  1 3.405628e-02        62   2.592075 -231.1337
## 24               - i.agri  1 4.710081e-02        63   2.639176 -231.7471
## 25          - i.wholesale  1 3.196753e-02        64   2.671144 -232.8200

summary(step.back)

## 
## Call:
## lm(formula = cor.1by1.big ~ Pop + race.white + race.black + race.asian + 
##     race.indigenous + race.other + i.transportation + i.admin.waste + 
##     i.education + i.other + pct.tick.income + payroll.pct, data = trainset.stripped)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.49401 -0.13496  0.01773  0.13799  0.38065 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1.648e+01  4.460e+00   3.694 0.000459 ***
## Pop              -7.575e-08  2.035e-08  -3.722 0.000419 ***
## race.white       -1.575e-01  4.398e-02  -3.582 0.000659 ***
## race.black       -1.424e-01  4.258e-02  -3.344 0.001384 ** 
## race.asian       -1.757e-01  4.827e-02  -3.639 0.000548 ***
## race.indigenous  -1.175e-01  7.672e-02  -1.532 0.130418    
## race.other       -1.266e-01  4.345e-02  -2.914 0.004909 ** 
## i.transportation  4.262e-02  2.465e-02   1.729 0.088644 .  
## i.admin.waste    -1.370e-01  4.919e-02  -2.784 0.007046 ** 
## i.education      -3.814e-02  1.960e-02  -1.946 0.056077 .  
## i.other          -7.303e-02  2.783e-02  -2.624 0.010852 *  
## pct.tick.income  -1.878e+00  7.188e-01  -2.612 0.011189 *  
## payroll.pct       1.806e-03  1.204e-03   1.500 0.138408    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2043 on 64 degrees of freedom
## Multiple R-squared:  0.4353, Adjusted R-squared:  0.3294 
## F-statistic: 4.111 on 12 and 64 DF,  p-value: 9.588e-05

performance <- as.data.frame(test_cor)

performance$step.back <- predict(step.back, testset)
performance$step <- predict(step, testset)

performance <- performance[order(performance$test_cor),]

plot(performance$test_cor)
lines(performance$step.back,type='l',col='blue')
lines(performance$step,type='l',col='red')

Looks like another failed model. Let’s try one more time:

set.seed(4)

train_index <- createDataPartition(coreset$cor.1by1.big, times = 1, p = 0.7, list = F)
trainset <- coreset[train_index,]
testset <- coreset[-train_index,]

train_cor <- trainset$cor.1by1.big
test_cor <- testset$cor.1by1.big
testset<-testset[,-1]

cor.test <- cor(trainset, use = "na.or.complete")
cor.test.melt <- arrange(melt(cor.test), -abs(value))

trainset.stripped <- trainset[,-c(6,7,36:38,42)]

every.lm.df <- data.frame(colnames(trainset.stripped[,-1]))
every.lm.df$pvalue <- NA
every.lm.df$rsquared <- NA
names(every.lm.df)[1] <- "variable"

###tried to build a function but did not work
fit1 <- lm(cor.1by1.big~adj.ticket.price, data = trainset.stripped)
fit2 <- lm(cor.1by1.big~Pop, data = trainset.stripped)
fit3 <- lm(cor.1by1.big~m.per.f.100, data = trainset.stripped)
fit4 <- lm(cor.1by1.big~Median.Age, data = trainset.stripped)
fit5 <- lm(cor.1by1.big~adj.median.income, data = trainset.stripped)
fit6 <- lm(cor.1by1.big~race.white, data = trainset.stripped)
fit7 <- lm(cor.1by1.big~race.black, data = trainset.stripped)
fit8 <- lm(cor.1by1.big~race.asian, data = trainset.stripped)
fit9 <- lm(cor.1by1.big~race.latin, data = trainset.stripped)
fit10 <- lm(cor.1by1.big~race.indigenous, data = trainset.stripped)
fit11 <- lm(cor.1by1.big~race.other, data = trainset.stripped)
fit12 <- lm(cor.1by1.big~race.mixed, data = trainset.stripped)
fit13 <- lm(cor.1by1.big~i.agri, data = trainset.stripped)
fit14 <- lm(cor.1by1.big~i.mining, data = trainset.stripped)
fit15 <- lm(cor.1by1.big~i.utilities, data = trainset.stripped)
fit16 <- lm(cor.1by1.big~i.construction, data = trainset.stripped)
fit17 <- lm(cor.1by1.big~i.manufacturing, data = trainset.stripped)
fit18 <- lm(cor.1by1.big~i.wholesale, data = trainset.stripped)
fit19 <- lm(cor.1by1.big~i.retail, data = trainset.stripped)
fit20 <- lm(cor.1by1.big~i.transportation, data = trainset.stripped)
fit21 <- lm(cor.1by1.big~i.info.culture, data = trainset.stripped)
fit22 <- lm(cor.1by1.big~i.finance.insur, data = trainset.stripped)
fit23 <- lm(cor.1by1.big~i.realestate, data = trainset.stripped)
fit24 <- lm(cor.1by1.big~i.sci.technical, data = trainset.stripped)
fit25 <- lm(cor.1by1.big~i.company.management, data = trainset.stripped)
fit26 <- lm(cor.1by1.big~i.admin.waste, data = trainset.stripped)
fit27 <- lm(cor.1by1.big~i.education, data = trainset.stripped)
fit28 <- lm(cor.1by1.big~i.health, data = trainset.stripped)
fit29 <- lm(cor.1by1.big~i.arts.entertain, data = trainset.stripped)
fit30 <- lm(cor.1by1.big~i.food.accommo, data = trainset.stripped)
fit31 <- lm(cor.1by1.big~i.other, data = trainset.stripped)
fit32 <- lm(cor.1by1.big~i.pub.admin, data = trainset.stripped)
fit33 <- lm(cor.1by1.big~z.pct.att.pop, data = trainset.stripped)
fit34 <- lm(cor.1by1.big~pct.tick.income, data = trainset.stripped)
fit35 <- lm(cor.1by1.big~payroll.pct, data = trainset.stripped)
fit36 <- lm(cor.1by1.big~tickprice.diff.1, data = trainset.stripped)

every.lm.df[1,2] <- anova(fit1)$'Pr(>F)'[1]
every.lm.df[2,2] <- anova(fit2)$'Pr(>F)'[1] 
every.lm.df[3,2] <- anova(fit3)$'Pr(>F)'[1] 
every.lm.df[4,2] <- anova(fit4)$'Pr(>F)'[1] 
every.lm.df[5,2] <- anova(fit5)$'Pr(>F)'[1] 
every.lm.df[6,2] <- anova(fit6)$'Pr(>F)'[1] 
every.lm.df[7,2] <- anova(fit7)$'Pr(>F)'[1] 
every.lm.df[8,2] <- anova(fit8)$'Pr(>F)'[1] 
every.lm.df[9,2] <- anova(fit9)$'Pr(>F)'[1] 
every.lm.df[10,2] <- anova(fit10)$'Pr(>F)'[1] 
every.lm.df[11,2] <- anova(fit11)$'Pr(>F)'[1] 
every.lm.df[12,2] <- anova(fit12)$'Pr(>F)'[1] 
every.lm.df[13,2] <- anova(fit13)$'Pr(>F)'[1] 
every.lm.df[14,2] <- anova(fit14)$'Pr(>F)'[1] 
every.lm.df[15,2] <- anova(fit15)$'Pr(>F)'[1] 
every.lm.df[16,2] <- anova(fit16)$'Pr(>F)'[1] 
every.lm.df[17,2] <- anova(fit17)$'Pr(>F)'[1] 
every.lm.df[18,2] <- anova(fit18)$'Pr(>F)'[1] 
every.lm.df[19,2] <- anova(fit19)$'Pr(>F)'[1]
every.lm.df[20,2] <- anova(fit20)$'Pr(>F)'[1] 
every.lm.df[21,2] <- anova(fit21)$'Pr(>F)'[1] 
every.lm.df[22,2] <- anova(fit22)$'Pr(>F)'[1]
every.lm.df[23,2] <- anova(fit23)$'Pr(>F)'[1] 
every.lm.df[24,2] <- anova(fit24)$'Pr(>F)'[1] 
every.lm.df[25,2] <- anova(fit25)$'Pr(>F)'[1] 
every.lm.df[26,2] <- anova(fit26)$'Pr(>F)'[1] 
every.lm.df[27,2] <- anova(fit27)$'Pr(>F)'[1] 
every.lm.df[28,2] <- anova(fit28)$'Pr(>F)'[1] 
every.lm.df[29,2] <- anova(fit29)$'Pr(>F)'[1] 
every.lm.df[30,2] <- anova(fit30)$'Pr(>F)'[1] 
every.lm.df[31,2] <- anova(fit31)$'Pr(>F)'[1] 
every.lm.df[32,2] <- anova(fit32)$'Pr(>F)'[1] 
every.lm.df[33,2] <- anova(fit33)$'Pr(>F)'[1] 
every.lm.df[34,2] <- anova(fit34)$'Pr(>F)'[1] 
every.lm.df[35,2] <- anova(fit35)$'Pr(>F)'[1] 
every.lm.df[36,2] <- anova(fit36)$'Pr(>F)'[1]

every.lm.df[1,3] <- summary(fit1)$r.squared
every.lm.df[2,3] <- summary(fit2)$r.squared 
every.lm.df[3,3] <- summary(fit3)$r.squared
every.lm.df[4,3] <- summary(fit4)$r.squared
every.lm.df[5,3] <- summary(fit5)$r.squared 
every.lm.df[6,3] <- summary(fit6)$r.squared
every.lm.df[7,3] <- summary(fit7)$r.squared 
every.lm.df[8,3] <- summary(fit8)$r.squared
every.lm.df[9,3] <- summary(fit9)$r.squared 
every.lm.df[10,3] <- summary(fit10)$r.squared 
every.lm.df[11,3] <- summary(fit11)$r.squared 
every.lm.df[12,3] <- summary(fit12)$r.squared 
every.lm.df[13,3] <- summary(fit13)$r.squared 
every.lm.df[14,3] <- summary(fit14)$r.squared 
every.lm.df[15,3] <- summary(fit15)$r.squared 
every.lm.df[16,3] <- summary(fit16)$r.squared 
every.lm.df[17,3] <- summary(fit17)$r.squared 
every.lm.df[18,3] <- summary(fit18)$r.squared 
every.lm.df[19,3] <- summary(fit19)$r.squared
every.lm.df[20,3] <- summary(fit20)$r.squared 
every.lm.df[21,3] <- summary(fit21)$r.squared 
every.lm.df[22,3] <- summary(fit22)$r.squared
every.lm.df[23,3] <- summary(fit23)$r.squared 
every.lm.df[24,3] <- summary(fit24)$r.squared 
every.lm.df[25,3] <- summary(fit25)$r.squared 
every.lm.df[26,3] <- summary(fit26)$r.squared 
every.lm.df[27,3] <- summary(fit27)$r.squared 
every.lm.df[28,3] <- summary(fit28)$r.squared 
every.lm.df[29,3] <- summary(fit29)$r.squared 
every.lm.df[30,3] <- summary(fit30)$r.squared 
every.lm.df[31,3] <- summary(fit31)$r.squared 
every.lm.df[32,3] <-summary(fit32)$r.squared 
every.lm.df[33,3] <- summary(fit33)$r.squared
every.lm.df[34,3] <- summary(fit34)$r.squared 
every.lm.df[35,3] <- summary(fit35)$r.squared 
every.lm.df[36,3] <- summary(fit36)$r.squared

full <- lm(cor.1by1.big~., data = trainset.stripped)

step <- stepAIC(full, direction = "both", trace = F)
step$anova

## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 + Median.Age + 
##     adj.median.income + race.white + race.black + race.asian + 
##     race.latin + race.indigenous + race.other + race.mixed + 
##     i.agri + i.mining + i.utilities + i.construction + i.manufacturing + 
##     i.wholesale + i.retail + i.transportation + i.info.culture + 
##     i.finance.insur + i.realestate + i.sci.technical + i.company.management + 
##     i.admin.waste + i.education + i.health + i.arts.entertain + 
##     i.food.accommo + i.other + i.pub.admin + z.pct.att.pop + 
##     pct.tick.income + payroll.pct + tickprice.diff.1
## 
## Final Model:
## cor.1by1.big ~ adj.ticket.price + m.per.f.100 + Median.Age + 
##     race.white + race.black + race.asian + race.latin + race.indigenous + 
##     race.other + i.agri + i.mining + i.utilities + i.finance.insur + 
##     i.company.management + i.education + i.health + i.other + 
##     payroll.pct
## 
## 
##                   Step Df     Deviance Resid. Df Resid. Dev       AIC
## 1                                             41   2.520170 -191.2999
## 2        - i.pub.admin  0 0.000000e+00        41   2.520170 -191.2999
## 3    - i.sci.technical  1 4.341235e-07        42   2.520170 -193.2999
## 4           - i.retail  1 3.632728e-05        43   2.520207 -195.2988
## 5      - i.admin.waste  1 1.902793e-04        44   2.520397 -197.2930
## 6    - pct.tick.income  1 1.494157e-03        45   2.521891 -199.2473
## 7      - z.pct.att.pop  1 3.564715e-03        46   2.525456 -201.1386
## 8   - i.transportation  1 3.156890e-03        47   2.528613 -203.0424
## 9         - race.mixed  1 6.023364e-03        48   2.534636 -204.8592
## 10 - adj.median.income  1 5.333797e-03        49   2.539970 -206.6973
## 11  - i.arts.entertain  1 5.793448e-03        50   2.545763 -208.5219
## 12       - i.wholesale  1 7.659889e-03        51   2.553423 -210.2905
## 13    - i.info.culture  1 8.909748e-03        52   2.562333 -212.0223
## 14    - i.food.accommo  1 9.999666e-03        53   2.572333 -213.7224
## 15    - i.construction  1 6.807192e-03        54   2.579140 -215.5189
## 16  - tickprice.diff.1  1 2.555870e-02        55   2.604698 -216.7596
## 17               - Pop  1 4.380728e-02        56   2.648506 -217.4754
## 18      - i.realestate  1 2.822534e-02        57   2.676731 -218.6591
## 19   - i.manufacturing  1 5.342762e-02        58   2.730159 -219.1373

summary(step)

## 
## Call:
## lm(formula = cor.1by1.big ~ adj.ticket.price + m.per.f.100 + 
##     Median.Age + race.white + race.black + race.asian + race.latin + 
##     race.indigenous + race.other + i.agri + i.mining + i.utilities + 
##     i.finance.insur + i.company.management + i.education + i.health + 
##     i.other + payroll.pct, data = trainset.stripped)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.52901 -0.08166  0.00237  0.10699  0.49528 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)   
## (Intercept)          17.585521   5.971909   2.945  0.00465 **
## adj.ticket.price     -0.002946   0.001696  -1.737  0.08769 . 
## m.per.f.100          -0.020357   0.015836  -1.286  0.20373   
## Median.Age            0.062546   0.025919   2.413  0.01900 * 
## race.white           -0.171731   0.053615  -3.203  0.00221 **
## race.black           -0.173062   0.053271  -3.249  0.00193 **
## race.asian           -0.206767   0.059874  -3.453  0.00104 **
## race.latin           -0.007565   0.003741  -2.022  0.04780 * 
## race.indigenous      -0.324256   0.117851  -2.751  0.00790 **
## race.other           -0.126541   0.051563  -2.454  0.01715 * 
## i.agri               -0.898461   0.430207  -2.088  0.04116 * 
## i.mining             -0.211939   0.098620  -2.149  0.03582 * 
## i.utilities           1.045356   0.310692   3.365  0.00136 **
## i.finance.insur      -0.089918   0.034397  -2.614  0.01138 * 
## i.company.management  2.638886   1.003342   2.630  0.01091 * 
## i.education           0.055289   0.018359   3.012  0.00385 **
## i.health             -0.059161   0.021129  -2.800  0.00693 **
## i.other              -0.077979   0.048001  -1.625  0.10968   
## payroll.pct           0.002614   0.001329   1.967  0.05398 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.217 on 58 degrees of freedom
## Multiple R-squared:  0.4241, Adjusted R-squared:  0.2454 
## F-statistic: 2.373 on 18 and 58 DF,  p-value: 0.006829

step.for <- stepAIC(full, direction = "forward", trace = F)
step.for$anova

## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 + Median.Age + 
##     adj.median.income + race.white + race.black + race.asian + 
##     race.latin + race.indigenous + race.other + race.mixed + 
##     i.agri + i.mining + i.utilities + i.construction + i.manufacturing + 
##     i.wholesale + i.retail + i.transportation + i.info.culture + 
##     i.finance.insur + i.realestate + i.sci.technical + i.company.management + 
##     i.admin.waste + i.education + i.health + i.arts.entertain + 
##     i.food.accommo + i.other + i.pub.admin + z.pct.att.pop + 
##     pct.tick.income + payroll.pct + tickprice.diff.1
## 
## Final Model:
## cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 + Median.Age + 
##     adj.median.income + race.white + race.black + race.asian + 
##     race.latin + race.indigenous + race.other + race.mixed + 
##     i.agri + i.mining + i.utilities + i.construction + i.manufacturing + 
##     i.wholesale + i.retail + i.transportation + i.info.culture + 
##     i.finance.insur + i.realestate + i.sci.technical + i.company.management + 
##     i.admin.waste + i.education + i.health + i.arts.entertain + 
##     i.food.accommo + i.other + i.pub.admin + z.pct.att.pop + 
##     pct.tick.income + payroll.pct + tickprice.diff.1
## 
## 
##   Step Df Deviance Resid. Df Resid. Dev       AIC
## 1                         41    2.52017 -191.2999

summary(step.for)

## 
## Call:
## lm(formula = cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 + 
##     Median.Age + adj.median.income + race.white + race.black + 
##     race.asian + race.latin + race.indigenous + race.other + 
##     race.mixed + i.agri + i.mining + i.utilities + i.construction + 
##     i.manufacturing + i.wholesale + i.retail + i.transportation + 
##     i.info.culture + i.finance.insur + i.realestate + i.sci.technical + 
##     i.company.management + i.admin.waste + i.education + i.health + 
##     i.arts.entertain + i.food.accommo + i.other + i.pub.admin + 
##     z.pct.att.pop + pct.tick.income + payroll.pct + tickprice.diff.1, 
##     data = trainset.stripped)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.53887 -0.10530 -0.01021  0.11738  0.53065 
## 
## Coefficients: (1 not defined because of singularities)
##                        Estimate Std. Error t value Pr(>|t|)  
## (Intercept)           1.093e+01  1.593e+01   0.686   0.4965  
## adj.ticket.price     -4.988e-03  9.921e-03  -0.503   0.6178  
## Pop                  -3.465e-08  1.044e-07  -0.332   0.7417  
## m.per.f.100          -1.411e-02  3.765e-02  -0.375   0.7098  
## Median.Age            9.765e-02  8.292e-02   1.178   0.2457  
## adj.median.income    -3.933e-06  1.985e-05  -0.198   0.8439  
## race.white           -1.213e-01  1.552e-01  -0.782   0.4390  
## race.black           -1.215e-01  1.556e-01  -0.780   0.4396  
## race.asian           -1.543e-01  1.689e-01  -0.914   0.3661  
## race.latin           -7.574e-03  9.080e-03  -0.834   0.4091  
## race.indigenous      -3.572e-01  2.794e-01  -1.278   0.2084  
## race.other           -7.089e-02  1.556e-01  -0.456   0.6511  
## race.mixed            4.532e-02  1.563e-01   0.290   0.7734  
## i.agri               -2.776e-01  1.264e+00  -0.220   0.8272  
## i.mining             -4.662e-02  3.037e-01  -0.154   0.8788  
## i.utilities           6.471e-01  7.632e-01   0.848   0.4015  
## i.construction       -3.726e-02  1.236e-01  -0.301   0.7646  
## i.manufacturing       2.712e-02  5.050e-02   0.537   0.5942  
## i.wholesale          -9.462e-02  2.246e-01  -0.421   0.6758  
## i.retail             -1.574e-03  7.032e-02  -0.022   0.9822  
## i.transportation      1.352e-02  6.683e-02   0.202   0.8407  
## i.info.culture       -5.364e-02  9.452e-02  -0.568   0.5735  
## i.finance.insur      -2.098e-02  9.091e-02  -0.231   0.8186  
## i.realestate          1.722e-01  4.365e-01   0.394   0.6953  
## i.sci.technical      -1.965e-04  7.396e-02  -0.003   0.9979  
## i.company.management  1.768e+00  2.075e+00   0.852   0.3990  
## i.admin.waste         6.900e-03  1.306e-01   0.053   0.9581  
## i.education           5.215e-02  5.561e-02   0.938   0.3539  
## i.health             -5.432e-02  4.520e-02  -1.202   0.2363  
## i.arts.entertain      2.506e-02  1.303e-01   0.192   0.8485  
## i.food.accommo       -4.774e-02  7.468e-02  -0.639   0.5262  
## i.other              -7.944e-02  1.126e-01  -0.706   0.4843  
## i.pub.admin                  NA         NA      NA       NA  
## z.pct.att.pop         1.656e-02  8.265e-02   0.200   0.8422  
## pct.tick.income       7.916e-01  5.063e+00   0.156   0.8765  
## payroll.pct           3.426e-03  1.804e-03   1.899   0.0646 .
## tickprice.diff.1      4.652e-03  1.124e-02   0.414   0.6810  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2479 on 41 degrees of freedom
## Multiple R-squared:  0.4684, Adjusted R-squared:  0.01466 
## F-statistic: 1.032 on 35 and 41 DF,  p-value: 0.4579

step.back<- stepAIC(full, direction = "backward", trace = F)
step.back$anova

## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 + Median.Age + 
##     adj.median.income + race.white + race.black + race.asian + 
##     race.latin + race.indigenous + race.other + race.mixed + 
##     i.agri + i.mining + i.utilities + i.construction + i.manufacturing + 
##     i.wholesale + i.retail + i.transportation + i.info.culture + 
##     i.finance.insur + i.realestate + i.sci.technical + i.company.management + 
##     i.admin.waste + i.education + i.health + i.arts.entertain + 
##     i.food.accommo + i.other + i.pub.admin + z.pct.att.pop + 
##     pct.tick.income + payroll.pct + tickprice.diff.1
## 
## Final Model:
## cor.1by1.big ~ adj.ticket.price + m.per.f.100 + Median.Age + 
##     race.white + race.black + race.asian + race.latin + race.indigenous + 
##     race.other + i.agri + i.mining + i.utilities + i.finance.insur + 
##     i.company.management + i.education + i.health + i.other + 
##     payroll.pct
## 
## 
##                   Step Df     Deviance Resid. Df Resid. Dev       AIC
## 1                                             41   2.520170 -191.2999
## 2        - i.pub.admin  0 0.000000e+00        41   2.520170 -191.2999
## 3    - i.sci.technical  1 4.341235e-07        42   2.520170 -193.2999
## 4           - i.retail  1 3.632728e-05        43   2.520207 -195.2988
## 5      - i.admin.waste  1 1.902793e-04        44   2.520397 -197.2930
## 6    - pct.tick.income  1 1.494157e-03        45   2.521891 -199.2473
## 7      - z.pct.att.pop  1 3.564715e-03        46   2.525456 -201.1386
## 8   - i.transportation  1 3.156890e-03        47   2.528613 -203.0424
## 9         - race.mixed  1 6.023364e-03        48   2.534636 -204.8592
## 10 - adj.median.income  1 5.333797e-03        49   2.539970 -206.6973
## 11  - i.arts.entertain  1 5.793448e-03        50   2.545763 -208.5219
## 12       - i.wholesale  1 7.659889e-03        51   2.553423 -210.2905
## 13    - i.info.culture  1 8.909748e-03        52   2.562333 -212.0223
## 14    - i.food.accommo  1 9.999666e-03        53   2.572333 -213.7224
## 15    - i.construction  1 6.807192e-03        54   2.579140 -215.5189
## 16  - tickprice.diff.1  1 2.555870e-02        55   2.604698 -216.7596
## 17               - Pop  1 4.380728e-02        56   2.648506 -217.4754
## 18      - i.realestate  1 2.822534e-02        57   2.676731 -218.6591
## 19   - i.manufacturing  1 5.342762e-02        58   2.730159 -219.1373

summary(step.back)

## 
## Call:
## lm(formula = cor.1by1.big ~ adj.ticket.price + m.per.f.100 + 
##     Median.Age + race.white + race.black + race.asian + race.latin + 
##     race.indigenous + race.other + i.agri + i.mining + i.utilities + 
##     i.finance.insur + i.company.management + i.education + i.health + 
##     i.other + payroll.pct, data = trainset.stripped)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.52901 -0.08166  0.00237  0.10699  0.49528 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)   
## (Intercept)          17.585521   5.971909   2.945  0.00465 **
## adj.ticket.price     -0.002946   0.001696  -1.737  0.08769 . 
## m.per.f.100          -0.020357   0.015836  -1.286  0.20373   
## Median.Age            0.062546   0.025919   2.413  0.01900 * 
## race.white           -0.171731   0.053615  -3.203  0.00221 **
## race.black           -0.173062   0.053271  -3.249  0.00193 **
## race.asian           -0.206767   0.059874  -3.453  0.00104 **
## race.latin           -0.007565   0.003741  -2.022  0.04780 * 
## race.indigenous      -0.324256   0.117851  -2.751  0.00790 **
## race.other           -0.126541   0.051563  -2.454  0.01715 * 
## i.agri               -0.898461   0.430207  -2.088  0.04116 * 
## i.mining             -0.211939   0.098620  -2.149  0.03582 * 
## i.utilities           1.045356   0.310692   3.365  0.00136 **
## i.finance.insur      -0.089918   0.034397  -2.614  0.01138 * 
## i.company.management  2.638886   1.003342   2.630  0.01091 * 
## i.education           0.055289   0.018359   3.012  0.00385 **
## i.health             -0.059161   0.021129  -2.800  0.00693 **
## i.other              -0.077979   0.048001  -1.625  0.10968   
## payroll.pct           0.002614   0.001329   1.967  0.05398 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.217 on 58 degrees of freedom
## Multiple R-squared:  0.4241, Adjusted R-squared:  0.2454 
## F-statistic: 2.373 on 18 and 58 DF,  p-value: 0.006829

performance <- as.data.frame(test_cor)

performance$step.back <- predict(step.back, testset)
performance$step <- predict(step, testset)

performance <- performance[order(performance$test_cor),]

plot(performance$test_cor)
lines(performance$step.back,type='l',col='blue')
lines(performance$step,type='l',col='red')

Nothing. Looks like none of our variables can predict the fairweatherness of a fanbase at a significant level.

Capstone

Shaun Ono

7/31/2017