library(stringr)
library(dplyr)
library(reshape2)
library(XML)
library(gsubfn)
library(data.table)
library(tidyr)
library(zoo)
library(lattice)
library(caret)
library(MASS)
library(splitstackshape)
library(ggplot2)
library(plotly)
library(plyr)
I begin first by loading each of the league datasets. These datasets were manually compiled and contains yearly information (winning pct, ticket prices, attendance…) for every team in the MLB, NHL, NBA and NFL from 1991:2016. I’ll clean up some of the variables as well as add in the salary cap for each league by year. I’ll then bind the leagues together to create one large dataset.
mlb <- read.csv("mlb.csv", header = T, stringsAsFactors = F)
nhl<- read.csv("nhl.csv", header = T, stringsAsFactors = F)
nba <- read.csv("nba.csv", header = T, stringsAsFactors = F)
nfl <- read.csv("nfl.csv", header = T, stringsAsFactors = F)
as.numeric.factor <- function(x) {as.numeric(levels(x))[x]}
nfl$Payroll <- str_trim(nfl$Payroll)
nfl$Payroll <- as.numeric(nfl$Payroll)
nfl$Salary.Cap <- as.numeric(nfl$Salary.Cap)
mlb$Salary.Cap <- as.numeric(mlb$Salary.Cap)
nba$Salary.Cap <- as.numeric(nba$Salary.Cap)
nhl$Salary.Cap <- as.numeric(nhl$Salary.Cap)
nfl$Salary.Cap[nfl$Year == 2009] <- 123000000
nfl$Salary.Cap[nfl$Year == 2008] <- 116000000
nfl$Salary.Cap[nfl$Year == 2007] <-109000000
nfl$Salary.Cap[nfl$Year == 2006] <-102000000
nfl$Salary.Cap[nfl$Year == 2005] <-85500000
nfl$Salary.Cap[nfl$Year == 2004] <-80582000
nfl$Salary.Cap[nfl$Year == 2003] <-75007000
nfl$Salary.Cap[nfl$Year == 2002] <-71101000
nfl$Salary.Cap[nfl$Year == 2001] <-67405000
nfl$Salary.Cap[nfl$Year == 2000] <-62172000
nfl$Salary.Cap[nfl$Year == 2010] <- NA
nfl$Salary.Cap[nfl$Year == 2011] <- 120000000
nfl$Salary.Cap[nfl$Year == 2012] <-120600000
nfl$Salary.Cap[nfl$Year == 2013] <-123000000
nfl$Salary.Cap[nfl$Year == 2014] <-133000000
nfl$Salary.Cap[nfl$Year == 2015] <-143280000
nfl$Salary.Cap[nfl$Year == 2016] <-155270000
nfl$Salary.Cap[nfl$Year == 1994] <-34608000
nfl$Salary.Cap[nfl$Year == 1995] <-37100000
nfl$Salary.Cap[nfl$Year == 1996] <-40753000
nfl$Salary.Cap[nfl$Year == 1997] <-41454000
nfl$Salary.Cap[nfl$Year == 1998] <-52388000
nfl$Salary.Cap[nfl$Year == 1999] <-57288000
nhl$Salary.Cap[nhl$Year == 2005] <-39000000
nhl$Salary.Cap[nhl$Year == 2006] <-44000000
nhl$Salary.Cap[nhl$Year == 2007] <-50300000
nhl$Salary.Cap[nhl$Year == 2008] <-56700000
nhl$Salary.Cap[nhl$Year == 2009] <-56800000
nhl$Salary.Cap[nhl$Year == 2010] <-59400000
nhl$Salary.Cap[nhl$Year == 2011] <-64300000
nhl$Salary.Cap[nhl$Year == 2012] <-60000000
nhl$Salary.Cap[nhl$Year == 2013] <-64300000
nhl$Salary.Cap[nhl$Year == 2014] <-69000000
nhl$Salary.Cap[nhl$Year == 2015] <-71400000
nhl$Salary.Cap[nhl$Year == 2016] <-73000000
nba$Salary.Cap[nba$Year == 1991] <-12500000
nba$Salary.Cap[nba$Year == 1992] <-14000000
nba$Salary.Cap[nba$Year == 1993] <-15175000
nba$Salary.Cap[nba$Year == 1994] <-15964000
nba$Salary.Cap[nba$Year == 1995] <-23000000
nba$Salary.Cap[nba$Year == 1996] <-24363000
nba$Salary.Cap[nba$Year == 1997] <-26900000
nba$Salary.Cap[nba$Year == 1998] <-30000000
nba$Salary.Cap[nba$Year == 1999] <-34000000
nba$Salary.Cap[nba$Year == 2000] <-35500000
nba$Salary.Cap[nba$Year == 2001] <-42500000
nba$Salary.Cap[nba$Year == 2002] <-40271000
nba$Salary.Cap[nba$Year == 2003] <-43840000
nba$Salary.Cap[nba$Year == 2004] <-43870000
nba$Salary.Cap[nba$Year == 2005] <-49500000
nba$Salary.Cap[nba$Year == 2006] <-53135000
nba$Salary.Cap[nba$Year == 2007] <-55630000
nba$Salary.Cap[nba$Year == 2008] <-58680000
nba$Salary.Cap[nba$Year == 2009] <-57700000
nba$Salary.Cap[nba$Year == 2010] <-58044000
nba$Salary.Cap[nba$Year == 2011] <-58044000
nba$Salary.Cap[nba$Year == 2012] <-58044000
nba$Salary.Cap[nba$Year == 2013] <-58679000
nba$Salary.Cap[nba$Year == 2014] <-63065000
nba$Salary.Cap[nba$Year == 2015] <-70000000
nba$Salary.Cap[nba$Year == 2016] <-94143000
mlb$Salary.Cap[mlb$Year == 2003] <-117000000
mlb$Salary.Cap[mlb$Year == 2004] <-120500000
mlb$Salary.Cap[mlb$Year == 2005] <-128000000
mlb$Salary.Cap[mlb$Year == 2006] <-136500000
mlb$Salary.Cap[mlb$Year == 2007] <-148000000
mlb$Salary.Cap[mlb$Year == 2008] <-155000000
mlb$Salary.Cap[mlb$Year == 2009] <-162000000
mlb$Salary.Cap[mlb$Year == 2010] <-170000000
mlb$Salary.Cap[mlb$Year == 2011] <-178000000
mlb$Salary.Cap[mlb$Year == 2012] <-178000000
mlb$Salary.Cap[mlb$Year == 2013] <-178000000
mlb$Salary.Cap[mlb$Year == 2014] <-189000000
mlb$Salary.Cap[mlb$Year == 2015] <-189000000
mlb$Salary.Cap[mlb$Year == 2016] <-189000000
four <- rbind(mlb[c(1:20)], nfl[c(1:20)], nba[c(1:20)], nhl[c(1:20)])
Now to attach demographic information to each city.
I will begin with population data that has been interpolated in Excel for the years in between census.
pop.cities <- read.csv("city_pop.csv", header = T)
pop.cities <- pop.cities[1:3]
majorfour <- merge(four, pop.cities, all.x = T, by = c("City", "Year"))
majorfour <- majorfour[,-15]
majorfour <- mutate(majorfour, att.pct = majorfour$Ave.Attendance/majorfour$Capacity)
##A mistake (will be found later)
majorfour$T.Attendance[majorfour$Year == 2006 & majorfour$Team == "Oakland Raiders"] <- 467964
majorfour$Ave.Attendance[majorfour$Year == 2006 & majorfour$Team == "Oakland Raiders"] <- 467964/8
head(majorfour)
## City Year Tm.ABV Team State League
## 1 Anaheim 1991 LAA Los Angeles Angels of Anaheim California MLB
## 2 Anaheim 1992 LAA Los Angeles Angels of Anaheim California MLB
## 3 Anaheim 1993 ANA Anaheim Ducks California NHL
## 4 Anaheim 1993 LAA Los Angeles Angels of Anaheim California MLB
## 5 Anaheim 1994 ANA Anaheim Ducks California NHL
## 6 Anaheim 1994 LAA Los Angeles Angels of Anaheim California MLB
## Venue Capacity T.Attendance Ave.Attendance Ave.Tick
## 1 Angel Stadium of Anaheim 64593 2416236 29830 8.76
## 2 Angel Stadium of Anaheim 64593 2065444 25499 8.02
## 3 Honda Center 17200 685848 16728 NA
## 4 Angel Stadium of Anaheim 64593 2057460 25401 8.02
## 5 Honda Center 17174 704134 17174 33.22
## 6 Angel Stadium of Anaheim 64593 1512622 24010 8.06
## Wpct Payroll Salary.Cap Median.Income Median.Age Ave.Temp TZ Age
## 1 0.500 33060001 NA NA NA NA NA 29.35
## 2 0.444 34749334 NA NA NA NA NA 28.45
## 3 0.423 8700000 NA NA NA NA NA NA
## 4 0.438 28588334 NA NA NA NA NA 28.05
## 5 0.385 13200000 NA NA NA NA NA NA
## 6 0.409 25156218 NA NA NA NA NA 28.35
## Pop att.pct
## 1 270608 0.4618147
## 2 274612 0.3947641
## 3 278740 0.9725581
## 4 278740 0.3932469
## 5 281772 1.0000000
## 6 281772 0.3717121
Now to begin loading the demographic data pulled from the US (1990, 2000, 2010, 2015) and Canadian Census (1996, 2001, 2010, 2016). I begin with the American census, creating a list of the American cities:
city.frame <- as.data.frame(table(majorfour$City))
city.frame <- city.frame[1]
names(city.frame)[1] <- "City"
citylist <- as.character(city.frame$City)
citylist <- citylist[-c(6, 15, 27, 35, 40, 51,52,54)]
citylist[14]<- "GreenBay"
citylist[19]<- "KansasCity"
citylist[20]<- "LosAngeles"
citylist[26]<- "NewOrleans"
citylist[27]<- "NewYork"
citylist[30]<- "Oklahoma"
citylist[38]<- "SaltLakeCity"
citylist[39]<- "SanAntonio"
citylist[40]<- "SanDiego"
citylist[41]<- "SanFransisco"
citylist[42]<- "SanJose"
citylist[44]<- "StLouis"
citylist[45]<- "Tampa"
citylist[46]<- "Washington"
citylist <- citylist[-15]
The demographic data has been pulled from individual files from each city/demographic info/census. I begin by loading the data and binding the cities together to create full datasets of each demographic variables from each census.
The first example is the Age Demographic information from the 2000 US Census. Here I create a function that searches from a list of all the files and then creates dataframes from those .csv files. I am then able to call these dataframes to bind the df’s into one large dataframe. A small caveat is that the 2000 Census demographic data does not contain information on Nashville, so I have had to account for this:
citylist.nonash <- citylist[-24]
filenames.Age2000 <- sprintf("Age_2000_%s.csv", citylist.nonash)
for (i in filenames.Age2000){
name <- gsub(".csv", "", i)
name <- gsub("Age_2000_", "", name)
name <- paste("Age", name, sep = ".")
all_content <- readLines(i)
skip_second <- all_content[-1]
assign(name, read.csv(textConnection(skip_second), header = T))
}
dfnames <- sprintf("Age.%s", citylist.nonash)
df1 <- '"Age.Anaheim" "Age.Atlanta" "Age.Baltimore" "Age.Boston" "Age.Buffalo"
[6] "Age.Charlotte" "Age.Chicago" "Age.Cincinnati" "Age.Cleveland" "Age.Columbus"
[11] "Age.Dallas" "Age.Denver" "Age.Detroit" "Age.GreenBay" "Age.Houston"
[16] "Age.Indianapolis" "Age.Jacksonville" "Age.KansasCity" "Age.LosAngeles" "Age.Memphis"
[21] "Age.Miami" "Age.Milwaukee" "Age.Minneapolis" "Age.NewOrleans" "Age.NewYork"
[26] "Age.Newark" "Age.Oakland" "Age.Oklahoma" "Age.Orlando" "Age.Philadelphia"
[31] "Age.Phoenix" "Age.Pittsburgh" "Age.Portland" "Age.Raleigh" "Age.Sacramento"
[36] "Age.SaltLakeCity" "Age.SanAntonio" "Age.SanDiego" "Age.SanFransisco" "Age.SanJose"
[41] "Age.Seattle" "Age.StLouis" "Age.Tampa" "Age.Washington"'
df1 <- gsub('"', "", df1)
df1 <- gsub("\\[", "", df1)
df1 <- gsub("\\]", "", df1)
df1 <- gsub("\\d", "", df1)
df1 <- gsub("\\s+", " ", df1)
df1 <- gsub("\\s", ", ", df1)
Age2000 <- rbind.fill(list(Age.Anaheim, Age.Atlanta, Age.Baltimore, Age.Boston, Age.Buffalo, Age.Charlotte, Age.Chicago, Age.Cincinnati, Age.Cleveland, Age.Columbus, Age.Dallas, Age.Denver, Age.Detroit, Age.GreenBay, Age.Houston, Age.Indianapolis, Age.Jacksonville, Age.KansasCity, Age.LosAngeles, Age.Memphis, Age.Miami, Age.Milwaukee, Age.Minneapolis, Age.NewOrleans, Age.NewYork, Age.Newark, Age.Oakland, Age.Oklahoma, Age.Orlando, Age.Philadelphia, Age.Phoenix, Age.Pittsburgh, Age.Portland, Age.Raleigh, Age.Sacramento, Age.SaltLakeCity, Age.SanAntonio, Age.SanDiego, Age.SanFransisco, Age.SanJose, Age.Seattle, Age.StLouis, Age.Tampa, Age.Washington))
I can now repeat these steps with the rest of the demographic datasets. I will be re-using names to save some time. There is also a bit of cleaning up for each of the datasets:
###Age2010
filenames.Age2010 <- sprintf("Age_2010_%s.csv", citylist)
for (i in filenames.Age2010){
name <- gsub(".csv", "", i)
name <- gsub("Age_2010_", "", name)
name <- paste("Age", name, sep = ".")
all_content <- readLines(i)
skip_second <- all_content[-1]
assign(name, read.csv(textConnection(skip_second), header = T))
}
Age2010 <- rbind.fill(list(Age.Anaheim, Age.Atlanta, Age.Baltimore, Age.Boston, Age.Buffalo, Age.Charlotte, Age.Chicago, Age.Cincinnati, Age.Cleveland, Age.Columbus, Age.Dallas, Age.Denver, Age.Detroit, Age.GreenBay, Age.Houston, Age.Indianapolis, Age.Jacksonville, Age.KansasCity, Age.LosAngeles, Age.Memphis, Age.Miami, Age.Milwaukee, Age.Minneapolis, Age.Nashville, Age.NewOrleans, Age.NewYork, Age.Newark, Age.Oakland, Age.Oklahoma, Age.Orlando, Age.Philadelphia, Age.Phoenix, Age.Pittsburgh, Age.Portland, Age.Raleigh, Age.Sacramento, Age.SaltLakeCity, Age.SanAntonio, Age.SanDiego, Age.SanFransisco, Age.SanJose, Age.Seattle, Age.StLouis, Age.Tampa, Age.Washington))
Age2010[4] <- lapply(Age2010[4], function (x) gsub("\\([^\\)]+\\)", "", x))
###AGE2015
filenames.Age2015 <- sprintf("Age_2015_%s.csv", citylist)
for (i in filenames.Age2015){
name <- gsub(".csv", "", i)
name <- gsub("Age_2015_", "", name)
name <- paste("Age", name, sep = ".")
all_content <- readLines(i)
skip_second <- all_content[-1]
assign(name, read.csv(textConnection(skip_second), header = T))
}
Age2015 <- rbind.fill(list(Age.Anaheim, Age.Atlanta, Age.Baltimore, Age.Boston, Age.Buffalo, Age.Charlotte, Age.Chicago, Age.Cincinnati, Age.Cleveland, Age.Columbus, Age.Dallas, Age.Denver, Age.Detroit, Age.GreenBay, Age.Houston, Age.Indianapolis, Age.Jacksonville, Age.KansasCity, Age.LosAngeles, Age.Memphis, Age.Miami, Age.Milwaukee, Age.Minneapolis, Age.Nashville, Age.NewOrleans, Age.NewYork, Age.Newark, Age.Oakland, Age.Oklahoma, Age.Orlando, Age.Philadelphia, Age.Phoenix, Age.Pittsburgh, Age.Portland, Age.Raleigh, Age.Sacramento, Age.SaltLakeCity, Age.SanAntonio, Age.SanDiego, Age.SanFransisco, Age.SanJose, Age.Seattle, Age.StLouis, Age.Tampa, Age.Washington))
Age2015[5] <- lapply(Age2015[5], function (x) gsub("\\*\\*\\*\\*\\*", NA, x))
###INCOME2000
filenames.Income2000 <- sprintf("Income_2000_%s.csv", citylist.nonash)
for (i in filenames.Income2000){
name <- gsub(".csv", "", i)
name <- gsub("Income_2000_", "", name)
name <- paste("Age", name, sep = ".")
all_content <- readLines(i)
skip_second <- all_content[-1]
assign(name, read.csv(textConnection(skip_second), header = T))
}
Income2000 <- rbind.fill(list(Age.Anaheim, Age.Atlanta, Age.Baltimore, Age.Boston, Age.Buffalo, Age.Charlotte, Age.Chicago, Age.Cincinnati, Age.Cleveland, Age.Columbus, Age.Dallas, Age.Denver, Age.Detroit, Age.GreenBay, Age.Houston, Age.Indianapolis, Age.Jacksonville, Age.KansasCity, Age.LosAngeles, Age.Memphis, Age.Miami, Age.Milwaukee, Age.Minneapolis, Age.NewOrleans, Age.NewYork, Age.Newark, Age.Oakland, Age.Oklahoma, Age.Orlando, Age.Philadelphia, Age.Phoenix, Age.Pittsburgh, Age.Portland, Age.Raleigh, Age.Sacramento, Age.SaltLakeCity, Age.SanAntonio, Age.SanDiego, Age.SanFransisco, Age.SanJose, Age.Seattle, Age.StLouis, Age.Tampa, Age.Washington))
###INCOME2010
filenames.Income2010 <- sprintf("Income_2010_%s.csv", citylist)
for (i in filenames.Income2010){
name <- gsub(".csv", "", i)
name <- gsub("Income_2010_", "", name)
name <- paste("Age", name, sep = ".")
all_content <- readLines(i)
skip_second <- all_content[-1]
assign(name, read.csv(textConnection(skip_second), header = T))
}
Income2010 <- rbind.fill(list(Age.Anaheim, Age.Atlanta, Age.Baltimore, Age.Boston, Age.Buffalo, Age.Charlotte, Age.Chicago, Age.Cincinnati, Age.Cleveland, Age.Columbus, Age.Dallas, Age.Denver, Age.Detroit, Age.GreenBay, Age.Houston, Age.Indianapolis, Age.Jacksonville, Age.KansasCity, Age.LosAngeles, Age.Memphis, Age.Miami, Age.Milwaukee, Age.Minneapolis, Age.Nashville, Age.NewOrleans, Age.NewYork, Age.Newark, Age.Oakland, Age.Oklahoma, Age.Orlando, Age.Philadelphia, Age.Phoenix, Age.Pittsburgh, Age.Portland, Age.Raleigh, Age.Sacramento, Age.SaltLakeCity, Age.SanAntonio, Age.SanDiego, Age.SanFransisco, Age.SanJose, Age.Seattle, Age.StLouis, Age.Tampa, Age.Washington))
###INCOME2015
filenames.Income2015 <- sprintf("Income_2015_%s.csv", citylist)
for (i in filenames.Income2015){
name <- gsub(".csv", "", i)
name <- gsub("Income_2015_", "", name)
name <- paste("Age", name, sep = ".")
all_content <- readLines(i)
skip_second <- all_content[-1]
assign(name, read.csv(textConnection(skip_second), header = T))
}
Income2015 <- rbind.fill(list(Age.Anaheim, Age.Atlanta, Age.Baltimore, Age.Boston, Age.Buffalo, Age.Charlotte, Age.Chicago, Age.Cincinnati, Age.Cleveland, Age.Columbus, Age.Dallas, Age.Denver, Age.Detroit, Age.GreenBay, Age.Houston, Age.Indianapolis, Age.Jacksonville, Age.KansasCity, Age.LosAngeles, Age.Memphis, Age.Miami, Age.Milwaukee, Age.Minneapolis, Age.Nashville, Age.NewOrleans, Age.NewYork, Age.Newark, Age.Oakland, Age.Oklahoma, Age.Orlando, Age.Philadelphia, Age.Phoenix, Age.Pittsburgh, Age.Portland, Age.Raleigh, Age.Sacramento, Age.SaltLakeCity, Age.SanAntonio, Age.SanDiego, Age.SanFransisco, Age.SanJose, Age.Seattle, Age.StLouis, Age.Tampa, Age.Washington))
###INDUSTRY2015
filenames.Industry2015 <- sprintf("Industry_2015_%s.csv", citylist)
for (i in filenames.Industry2015){
name <- gsub(".csv", "", i)
name <- gsub("Industry_2015_", "", name)
name <- paste("Age", name, sep = ".")
all_content <- readLines(i)
skip_second <- all_content[-1]
assign(name, read.csv(textConnection(skip_second), header = T))
}
Industry2015 <- rbind.fill(list(Age.Anaheim, Age.Atlanta, Age.Baltimore, Age.Boston, Age.Buffalo, Age.Charlotte, Age.Chicago, Age.Cincinnati, Age.Cleveland, Age.Columbus, Age.Dallas, Age.Denver, Age.Detroit, Age.GreenBay, Age.Houston, Age.Indianapolis, Age.Jacksonville, Age.KansasCity, Age.LosAngeles, Age.Memphis, Age.Miami, Age.Milwaukee, Age.Minneapolis, Age.Nashville, Age.NewOrleans, Age.NewYork, Age.Newark, Age.Oakland, Age.Oklahoma, Age.Orlando, Age.Philadelphia, Age.Phoenix, Age.Pittsburgh, Age.Portland, Age.Raleigh, Age.Sacramento, Age.SaltLakeCity, Age.SanAntonio, Age.SanDiego, Age.SanFransisco, Age.SanJose, Age.Seattle, Age.StLouis, Age.Tampa, Age.Washington))
###RACE2015
filenames.Race2015 <- sprintf("Race_2015_%s.csv", citylist)
for (i in filenames.Race2015){
name <- gsub(".csv", "", i)
name <- gsub("Race_2015_", "", name)
name <- paste("Age", name, sep = ".")
all_content <- readLines(i)
skip_second <- all_content[-1]
assign(name, read.csv(textConnection(skip_second), header = T))
}
Race2015 <- rbind.fill(list(Age.Anaheim, Age.Atlanta, Age.Baltimore, Age.Boston, Age.Buffalo, Age.Charlotte, Age.Chicago, Age.Cincinnati, Age.Cleveland, Age.Columbus, Age.Dallas, Age.Denver, Age.Detroit, Age.GreenBay, Age.Houston, Age.Indianapolis, Age.Jacksonville, Age.KansasCity, Age.LosAngeles, Age.Memphis, Age.Miami, Age.Milwaukee, Age.Minneapolis, Age.Nashville, Age.NewOrleans, Age.NewYork, Age.Newark, Age.Oakland, Age.Oklahoma, Age.Orlando, Age.Philadelphia, Age.Phoenix, Age.Pittsburgh, Age.Portland, Age.Raleigh, Age.Sacramento, Age.SaltLakeCity, Age.SanAntonio, Age.SanDiego, Age.SanFransisco, Age.SanJose, Age.Seattle, Age.StLouis, Age.Tampa, Age.Washington))
Race2015[5] <- lapply(Race2015[5], function (x) gsub("\\*\\*\\*\\*\\*", NA, x))
###RACE2010
filenames.Race2010 <- sprintf("Race_2010_%s.csv", citylist)
for (i in filenames.Race2010){
name <- gsub(".csv", "", i)
name <- gsub("Race_2010_", "", name)
name <- paste("Age", name, sep = ".")
all_content <- readLines(i)
skip_second <- all_content[-1]
assign(name, read.csv(textConnection(skip_second), header = T))
}
Race2010 <- rbind.fill(list(Age.Anaheim, Age.Atlanta, Age.Baltimore, Age.Boston, Age.Buffalo, Age.Charlotte, Age.Chicago, Age.Cincinnati, Age.Cleveland, Age.Columbus, Age.Dallas, Age.Denver, Age.Detroit, Age.GreenBay, Age.Houston, Age.Indianapolis, Age.Jacksonville, Age.KansasCity, Age.LosAngeles, Age.Memphis, Age.Miami, Age.Milwaukee, Age.Minneapolis, Age.Nashville, Age.NewOrleans, Age.NewYork, Age.Newark, Age.Oakland, Age.Oklahoma, Age.Orlando, Age.Philadelphia, Age.Phoenix, Age.Pittsburgh, Age.Portland, Age.Raleigh, Age.Sacramento, Age.SaltLakeCity, Age.SanAntonio, Age.SanDiego, Age.SanFransisco, Age.SanJose, Age.Seattle, Age.StLouis, Age.Tampa, Age.Washington))
Race2010$Number..RACE...Total.population <- lapply(Race2010$Number..RACE...Total.population, function (x) gsub("\\([^\\)]+\\)", "", x))
Race2010[52] <- lapply(Race2010[52], function (x) gsub("\\([^\\)]+\\)", "", x))
Race2010[38] <- lapply(Race2010[38], function (x) gsub("\\([^\\)]+\\)", "", x))
###RACE2000
filenames.Race2000 <- sprintf("Race_2000_%s.csv", citylist.nonash)
for (i in filenames.Race2000){
name <- gsub(".csv", "", i)
name <- gsub("Race_2000_", "", name)
name <- paste("Age", name, sep = ".")
all_content <- readLines(i)
skip_second <- all_content[-1]
assign(name, read.csv(textConnection(skip_second), header = T))
}
Race2000 <- rbind.fill(list(Age.Anaheim, Age.Atlanta, Age.Baltimore, Age.Boston, Age.Buffalo, Age.Charlotte, Age.Chicago, Age.Cincinnati, Age.Cleveland, Age.Columbus, Age.Dallas, Age.Denver, Age.Detroit, Age.GreenBay, Age.Houston, Age.Indianapolis, Age.Jacksonville, Age.KansasCity, Age.LosAngeles, Age.Memphis, Age.Miami, Age.Milwaukee, Age.Minneapolis, Age.NewOrleans, Age.NewYork, Age.Newark, Age.Oakland, Age.Oklahoma, Age.Orlando, Age.Philadelphia, Age.Phoenix, Age.Pittsburgh, Age.Portland, Age.Raleigh, Age.Sacramento, Age.SaltLakeCity, Age.SanAntonio, Age.SanDiego, Age.SanFransisco, Age.SanJose, Age.Seattle, Age.StLouis, Age.Tampa, Age.Washington))
###Census 1990
Census_1990 <- read.csv("citylist.csv", header = T)
Next is the Canadian Census data sets:
canlist <- c("Montreal", "Ottawa", "Toronto", "Winnipeg", "Calgary", "Edmonton", "Vancouver")
#Census 1996
filenames.Census <- sprintf("Census_1996_%s.csv", canlist)
for (i in filenames.Census){
name <- gsub(".csv", "", i)
name <- gsub("Census_1996_", "", name)
name <- paste("Census", name, sep = ".")
assign(name, read.csv(i, header = F))
}
Can_Census1996 <- rbind.fill(list(Census.Calgary, Census.Edmonton, Census.Montreal, Census.Ottawa, Census.Toronto, Census.Vancouver, Census.Winnipeg))
###Age 2001
filenames.Census <- sprintf("Age_2001_%s.csv", canlist)
for (i in filenames.Census){
name <- gsub(".csv", "", i)
name <- gsub("Age_2001_", "", name)
name <- paste("Census", name, sep = ".")
assign(name, read.csv(i, header = T))
}
Can_Age2001 <- rbind.fill(list(Census.Calgary, Census.Edmonton, Census.Montreal, Census.Ottawa, Census.Toronto, Census.Vancouver, Census.Winnipeg))
###Age 2006
filenames.Census <- sprintf("Age_2006_%s.csv", canlist)
for (i in filenames.Census){
name <- gsub(".csv", "", i)
name <- gsub("Age_2006_", "", name)
name <- paste("Census", name, sep = ".")
assign(name, read.csv(i, header = T))
}
Can_Age2006 <- rbind.fill(list(Census.Calgary, Census.Edmonton, Census.Montreal, Census.Ottawa, Census.Toronto, Census.Vancouver, Census.Winnipeg))
###Age 2011
filenames.Census <- sprintf("Age_2011_%s.csv", canlist)
for (i in filenames.Census){
name <- gsub(".csv", "", i)
name <- gsub("Age_2011_", "", name)
name <- paste("Census", name, sep = ".")
assign(name, read.csv(i, header = T))
}
Can_Age2011 <- rbind.fill(list(Census.Calgary, Census.Edmonton, Census.Montreal, Census.Ottawa, Census.Toronto, Census.Vancouver, Census.Winnipeg))
###Income 2001
Can_Income2001 <- read.csv("Income_2001_Allcity.csv", header = T)
###Income 2011
filenames.Census <- sprintf("Income_2011_%s.csv", canlist)
for (i in filenames.Census){
name <- gsub(".csv", "", i)
name <- gsub("Income_2011_", "", name)
name <- paste("Census", name, sep = ".")
assign(name, read.csv(i, header = T))
}
Can_Income2011 <- rbind.fill(list(Census.Calgary, Census.Edmonton, Census.Montreal, Census.Ottawa, Census.Toronto, Census.Vancouver, Census.Winnipeg))
###Industry 2001
filenames.Census <- sprintf("Industry_2001_%s.csv", canlist)
for (i in filenames.Census){
name <- gsub(".csv", "", i)
name <- gsub("Industry_2001_", "", name)
name <- paste("Census", name, sep = ".")
assign(name, read.csv(i, header = T))
}
Can_Industry2001 <- rbind.fill(list(Census.Calgary, Census.Edmonton, Census.Montreal, Census.Ottawa, Census.Toronto, Census.Vancouver, Census.Winnipeg))
###Industry 2011
filenames.Census <- sprintf("Industry_2011_%s.csv", canlist)
for (i in filenames.Census){
name <- gsub(".csv", "", i)
name <- gsub("Industry_2011_", "", name)
name <- paste("Census", name, sep = ".")
assign(name, read.csv(i, header = T))
}
Can_Industry2011 <- rbind.fill(list(Census.Calgary, Census.Edmonton, Census.Montreal, Census.Ottawa, Census.Toronto, Census.Vancouver, Census.Winnipeg))
###Race 2001
filenames.Census <- sprintf("Race_2001_%s.csv", canlist)
for (i in filenames.Census){
name <- gsub(".csv", "", i)
name <- gsub("Race_2001_", "", name)
name <- paste("Census", name, sep = ".")
assign(name, read.csv(i, header = T))
}
Can_Race2001 <- rbind.fill(list(Census.Calgary, Census.Edmonton, Census.Montreal, Census.Ottawa, Census.Toronto, Census.Vancouver, Census.Winnipeg))
###Race 2011
filenames.Census <- sprintf("Race_2011_%s.csv", canlist)
for (i in filenames.Census){
name <- gsub(".csv", "", i)
name <- gsub("Race_2011_", "", name)
name <- paste("Census", name, sep = ".")
assign(name, read.csv(i, header = F, sep = ","))
}
Can_Race2011 <- rbind.fill(list(Census.Calgary, Census.Edmonton, Census.Montreal, Census.Ottawa, Census.Toronto, Census.Vancouver, Census.Winnipeg))
###Can Census 2016
Can_Census2016 <- read.csv("Census_2016.csv", header = T)
The next step is to merge all this demographic information to the year by year team information found in the majourfour dataframe. I decided to create uniform census datasets that all contained the same 62 variables. Any missing variables were given NA’s. This was however quite difficult as each of the different demographic datasets contained different variables or different ways of interpreting these variables. I worked closely with the original data to know how to transform the data into the uniform datasets I was looking for:
##Can Census 1996
Can_Census1996 <- Can_Census1996[,c(1, 3, 7, 8, 9, 10, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 1622, 473, 474, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1036)]
names(Can_Census1996)[1] <- 'city'
names(Can_Census1996)[2] <- 'Population'
Can_Census1996 <- mutate(Can_Census1996, m.per.f.100 = (Can_Census1996$V7/Can_Census1996$V31)*100)
Can_Census1996 <- mutate(Can_Census1996, age.Less5 = (Can_Census1996$V8 + Can_Census1996$V32)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.5.9 = (Can_Census1996$V9 + Can_Census1996$V33)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.10.14 = (Can_Census1996$V10 + Can_Census1996$V34)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.15.19 = (Can_Census1996$V16 + Can_Census1996$V40)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.20.24 = (Can_Census1996$V17 + Can_Census1996$V41)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.25.29 = (Can_Census1996$V18 + Can_Census1996$V42)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.30.34 = (Can_Census1996$V19 + Can_Census1996$V43)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.35.39 = (Can_Census1996$V20 + Can_Census1996$V44)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.40.44 = (Can_Census1996$V21 + Can_Census1996$V45)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.45.49 = (Can_Census1996$V22 + Can_Census1996$V46)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.50.54 = (Can_Census1996$V23 + Can_Census1996$V47)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.55.59 = (Can_Census1996$V24 + Can_Census1996$V48)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.60.64 = (Can_Census1996$V25 + Can_Census1996$V49)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.65.69 = (Can_Census1996$V26 + Can_Census1996$V50)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.70.74 = (Can_Census1996$V27 + Can_Census1996$V51)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.75.79 = (Can_Census1996$V28 + Can_Census1996$V52)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.80.84 = (Can_Census1996$V29 + Can_Census1996$V53)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, age.Plus85 = (Can_Census1996$V30 + Can_Census1996$V54)/Can_Census1996[,2]*100)
Can_Census1996 <- mutate(Can_Census1996, Median.Income = Can_Census1996$V1622)
Can_Census1996 <- mutate(Can_Census1996, race.white = (Can_Census1996$V797 - Can_Census1996$V474)/Can_Census1996$V473*100)
Can_Census1996 <- mutate(Can_Census1996, race.black = Can_Census1996$V786/Can_Census1996$V473*100)
Can_Census1996 <- mutate(Can_Census1996, race.asian = (Can_Census1996$V787 + Can_Census1996$V788 + Can_Census1996$V789 + Can_Census1996$V790 + Can_Census1996$V791 + Can_Census1996$V792)/Can_Census1996$V473*100)
Can_Census1996 <- mutate(Can_Census1996, race.latin = Can_Census1996$V794/Can_Census1996$V473*100)
Can_Census1996 <- mutate(Can_Census1996, race.indigenous = Can_Census1996$V474/Can_Census1996$V473*100)
Can_Census1996 <- mutate(Can_Census1996, race.other = (Can_Census1996$V795 + Can_Census1996$V793)/Can_Census1996$V473*100)
Can_Census1996 <- mutate(Can_Census1996, race.mixed = Can_Census1996$V796/Can_Census1996$V473*100)
Can_Census1996 <- mutate(Can_Census1996, i.agri = (Can_Census1996$V1017 + Can_Census1996$V1018 + Can_Census1996$V1019)/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.mining = Can_Census1996$V1020/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.utilities = NA)
Can_Census1996 <- mutate(Can_Census1996, i.construction = Can_Census1996$V1022/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.manufacturing = Can_Census1996$V1021/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.wholesale = Can_Census1996$V1025/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.retail = Can_Census1996$V1026/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.transportation = Can_Census1996$V1023/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.info.culture = Can_Census1996$V1024/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.finance.insur = Can_Census1996$V1027/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.realestate = Can_Census1996$V1028/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.sci.technical = NA)
Can_Census1996 <- mutate(Can_Census1996, i.company.management = Can_Census1996$V1029/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.admin.waste = NA)
Can_Census1996 <- mutate(Can_Census1996, i.education = Can_Census1996$V1031/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.health = Can_Census1996$V1032/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.arts.entertain = NA)
Can_Census1996 <- mutate(Can_Census1996, i.food.accommo = Can_Census1996$V1033/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.other = Can_Census1996$V1034/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, i.pub.admin = Can_Census1996$V1030/Can_Census1996$V1016*100)
Can_Census1996 <- mutate(Can_Census1996, Median.Age = NA)
Can_Census1996 <- mutate(Can_Census1996, Male.Median = NA)
Can_Census1996 <- mutate(Can_Census1996, Female.Median = NA)
Can_Census1996 <- mutate(Can_Census1996, Year = 1996)
Can_Census1996$inc.0.10 <- NA
Can_Census1996$inc.10.15 <- NA
Can_Census1996$inc.15.25 <- NA
Can_Census1996$inc.25.35 <- NA
Can_Census1996$inc.35.50 <- NA
Can_Census1996$inc.50.75 <- NA
Can_Census1996$inc.75.100 <- NA
Can_Census1996$inc.100.150 <- NA
Can_Census1996$inc.150.200 <- NA
Can_Census1996$inc.200.plus <- NA
Can_Census1996 <- Can_Census1996[,c(1,126,76,123:125,77:95,127:136,96:122)]
names(Can_Census1996)[1] <- "City"
###Can Age 2001
Can_Age2001 <- mutate(Can_Age2001, m.per.f.100 = Can_Age2001$Male.Tot/Can_Age2001$Female.Tot*100)
Can_Age2001 <- mutate(Can_Age2001, Year = 2001)
Can_Age2001 <- mutate(Can_Age2001, age.Less5 = Can_Age2001[,4]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.5.9 = Can_Age2001[,10]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.10.14 = Can_Age2001[,16]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.15.19 = Can_Age2001[,22]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.20.24 = Can_Age2001[,28]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.25.29 = Can_Age2001[,34]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.30.34 = Can_Age2001[,40]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.35.39 = Can_Age2001[,46]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.40.44 = Can_Age2001[,52]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.45.49 = Can_Age2001[,58]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.50.54 = Can_Age2001[,64]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.55.59 = Can_Age2001[,70]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.60.64 = Can_Age2001[,76]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.65.69 = Can_Age2001[,82]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.70.74 = Can_Age2001[,88]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.75.79 = Can_Age2001[,94]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.80.84 = Can_Age2001[,100]/Can_Age2001[,1]*100)
Can_Age2001 <- mutate(Can_Age2001, age.Plus85 = (Can_Age2001[,106] + Can_Age2001[,112] + Can_Age2001[,118] + Can_Age2001[,124])/Can_Age2001[,1]*100)
Can_Age2001 <- Can_Age2001[,c(128,130,1,129,125,126,127,131:148)]
names(Can_Age2001)[3] <- "Total.Pop"
names(Can_Age2001)[5] <- "Median.Age"
names(Can_Age2001)[6] <- "Male.Median"
names(Can_Age2001)[7] <- "Female.Median"
Can_Age2001 <- Can_Age2001[,-3]
###Can Age 2006
Can_Age2006 <- mutate(Can_Age2006, m.per.f.100 = Can_Age2006$Male.Tot/Can_Age2006$Female.Tot*100)
Can_Age2006 <- mutate(Can_Age2006, Year = 2006)
Can_Age2006 <- mutate(Can_Age2006, age.Less5 = Can_Age2006[,9]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.5.9 = Can_Age2006[,15]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.10.14 = Can_Age2006[,21]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.15.19 = Can_Age2006[,28]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.20.24 = Can_Age2006[,34]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.25.29 = Can_Age2006[,41]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.30.34 = Can_Age2006[,47]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.35.39 = Can_Age2006[,54]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.40.44 = Can_Age2006[,60]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.45.49 = Can_Age2006[,67]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.50.54 = Can_Age2006[,73]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.55.59 = Can_Age2006[,80]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.60.64 = Can_Age2006[,86]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.65.69 = Can_Age2006[,93]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.70.74 = Can_Age2006[,99]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.75.79 = Can_Age2006[,105]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.80.84 = Can_Age2006[,112]/Can_Age2006[,1]*100)
Can_Age2006 <- mutate(Can_Age2006, age.Plus85 = (Can_Age2006[,118] + Can_Age2006[,124] + Can_Age2006[,130] + Can_Age2006[,136])/Can_Age2006[,1]*100)
Can_Age2006 <- Can_Age2006[,c(5,138,1,137,2,3,4,139:156)]
names(Can_Age2006) <- colnames(Can_Age2001)
Can_Age2006 <- Can_Age2006[,-3]
###Can Age 2011
Can_Age2011 <- mutate(Can_Age2011, m.per.f.100 = Can_Age2011$Male.Tot/Can_Age2011$Female.Tot*100)
Can_Age2011 <- mutate(Can_Age2011, Year = 2011)
Can_Age2011 <- mutate(Can_Age2011, age.Less5 = Can_Age2011[,5]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.5.9 = Can_Age2011[,11]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.10.14 = Can_Age2011[,17]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.15.19 = Can_Age2011[,24]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.20.24 = Can_Age2011[,30]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.25.29 = Can_Age2011[,37]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.30.34 = Can_Age2011[,43]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.35.39 = Can_Age2011[,50]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.40.44 = Can_Age2011[,56]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.45.49 = Can_Age2011[,63]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.50.54 = Can_Age2011[,69]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.55.59 = Can_Age2011[,76]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.60.64 = Can_Age2011[,82]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.65.69 = Can_Age2011[,89]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.70.74 = Can_Age2011[,95]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.75.79 = Can_Age2011[,101]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.80.84 = Can_Age2011[,108]/Can_Age2011[,1]*100)
Can_Age2011 <- mutate(Can_Age2011, age.Plus85 = (Can_Age2011[,114] + Can_Age2011[,120] + Can_Age2011[,126] + Can_Age2011[,132])/Can_Age2011[,1]*100)
Can_Age2011 <- Can_Age2011[,c(136,138,1,137,133,134,135,139:156)]
names(Can_Age2011) <- colnames(Can_Age2001)
Can_Age2011 <- Can_Age2011[,-3]
###USA Age 2000
Age2000 <- Age2000[,c(3,4,10, 284, 285, 286, 14, 21, 28, 35, 42, 49, 56, 63, 70, 77, 84, 91, 98, 105, 112, 119, 126, 133,140)]
Age2000 <- mutate(Age2000, age.Plus85 = (Age2000[,24] + Age2000[,25]))
Age2000 <- mutate(Age2000, Year = 2000)
Age2000 <- Age2000[,c(1,27,2:23,26)]
Age2000 <- Age2000[,-3]
names(Age2000) <- colnames(Can_Age2001)
###USA Age 2010
Age2010 <- Age2010[,c(3,4,10, 284, 285, 286, 14, 21, 28, 35, 42, 49, 56, 63, 70, 77, 84, 91, 98, 105, 112, 119, 126, 133,140)]
Age2010 <- mutate(Age2010, age.Plus85 = (Age2010[,24] + Age2010[,25]))
Age2010 <- mutate(Age2010, Year = 2010)
Age2010 <- Age2010[,c(1,27,2:23,26)]
Age2010 <- Age2010[,-3]
names(Age2010) <- colnames(Can_Age2001)
###USA Age 2015
Age2015 <- Age2015[,c(3,4,184, 178, 180, 182, 10, 16, 22, 28, 34, 40, 46, 52, 58, 64, 70, 76, 82, 88, 94, 100, 106, 112)]
Age2015 <- mutate(Age2015, Year = 2015)
Age2015 <- Age2015[,c(1,25,2:24)]
Age2015 <- Age2015[,-3]
names(Age2015) <- colnames(Can_Age2001)
###Income 2000 USA
Income2000 <- Income2000[,c(3, 118, 99, 101, 103,105, 107, 109, 111, 113, 115, 117)]
Income2000 <- mutate(Income2000, Year = 2000)
Income2000 <- Income2000[,c(1,13,2:12)]
names(Income2000) <- c("City", "Year", "Median.Income", "inc.0.10", "inc.10.15", "inc.15.25", "inc.25.35", "inc.35.50",
"inc.50.75", "inc.75.100", "inc.100.150", "inc.150.200", "inc.200.plus")
###Income 2010 USA
Income2010 <- Income2010[,c(3, 92, 12, 20, 28, 36, 44, 52, 60,68, 76, 84)]
Income2010 <- mutate(Income2010, Year = 2010)
Income2010 <- Income2010[,c(1,13,2:12)]
names(Income2010) <- colnames(Income2000)
###Income 2015 USA
Income2015 <- Income2015[,c(3, 92, 12, 20, 28, 36, 44, 52, 60,68, 76, 84)]
Income2015 <- mutate(Income2015, Year = 2015)
Income2015 <- Income2015[,c(1,13,2:12)]
names(Income2015) <- colnames(Income2000)
#Income CAN2001
Can_Income2001 <- Can_Income2001[,c(4,2)]
Can_Income2001$Year <- 2001
Can_Income2001 <- Can_Income2001[,c(1,3,2)]
Can_Income2001$inc.0.10 <- NA
Can_Income2001$inc.10.15 <- NA
Can_Income2001$inc.15.25 <- NA
Can_Income2001$inc.25.35 <- NA
Can_Income2001$inc.35.50 <- NA
Can_Income2001$inc.50.75 <- NA
Can_Income2001$inc.75.100 <- NA
Can_Income2001$inc.100.150 <- NA
Can_Income2001$inc.150.200 <- NA
Can_Income2001$inc.200.plus <- NA
names(Can_Income2001)[3] <- "Median.Income"
#Income CAN2011
Can_Income2011 <- mutate(Can_Income2011, inc.0.10 = (Can_Income2011[,20] + Can_Income2011[,21])/Can_Income2011[,19]*100)
Can_Income2011 <- mutate(Can_Income2011, inc.10.15 = Can_Income2011[,22]/Can_Income2011[,19]*100)
Can_Income2011 <- mutate(Can_Income2011, inc.15.25 = (Can_Income2011[,23] + (Can_Income2011[,24]/2))/Can_Income2011[,19]*100)
Can_Income2011 <- mutate(Can_Income2011, inc.25.35 = ((Can_Income2011[,25]/2) + (Can_Income2011[,24]/2))/Can_Income2011[,19]*100)
Can_Income2011 <- mutate(Can_Income2011, inc.35.50 = (Can_Income2011[,26] + (Can_Income2011[,25]/2))/Can_Income2011[,19]*100)
Can_Income2011 <- mutate(Can_Income2011, inc.50.75 = (Can_Income2011[,27] + (Can_Income2011[,28]/2))/Can_Income2011[,19]*100)
Can_Income2011 <- mutate(Can_Income2011, inc.75.100 = (Can_Income2011[,29] + (Can_Income2011[,28]/2))/Can_Income2011[,19]*100)
Can_Income2011 <- mutate(Can_Income2011, inc.100.150 = NA)
Can_Income2011 <- mutate(Can_Income2011, inc.150.200 = NA)
Can_Income2011 <- mutate(Can_Income2011, inc.inc.200.plus = NA)
Can_Income2011 <- mutate(Can_Income2011, Year = 2011)
Can_Income2011 <- Can_Income2011[,c(1,45,33,35:44)]
names(Can_Income2011) <- colnames(Income2000)
###Industry 2015 USA:
Industry2015 <- Industry2015[,c(3,4,24,34,104,44,54,64,74,94,114,134, 144, 164, 174, 184, 204, 214, 234, 244, 254, 264)]
Industry2015 <- mutate(Industry2015, i.agri = Industry2015[,3]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.mining = Industry2015[,4]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.utilities = Industry2015[,5]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.construction = Industry2015[,6]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.manufacturing = Industry2015[,7]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.wholesale = Industry2015[,8]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.retail = Industry2015[,9]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.transportation = Industry2015[,10]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.info.culture = Industry2015[,11]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.finance.insur = Industry2015[,12]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.realestate = Industry2015[,13]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.sci.technical = Industry2015[,14]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.company.management = Industry2015[,15]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.admin.waste = Industry2015[,16]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.education = Industry2015[,17]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.health = Industry2015[,18]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.arts.entertain = Industry2015[,19]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.food.accommo = Industry2015[,20]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.other = Industry2015[,21]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, i.pub.admin = Industry2015[,22]/Industry2015[,2]*100)
Industry2015 <- mutate(Industry2015, Year = 2015)
Industry2015 <- Industry2015[,c(1,43,23:42)]
names(Industry2015)[1] <- "City"
###CAN Industry 2001
Can_Industry2001 <- mutate(Can_Industry2001, i.agri = Can_Industry2001[,4]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.mining = Can_Industry2001[,5]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.utilities = Can_Industry2001[,6]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.construction = Can_Industry2001[,7]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.manufacturing = Can_Industry2001[,8]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.wholesale = Can_Industry2001[,9]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.retail = Can_Industry2001[,10]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.transportation = Can_Industry2001[,11]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.info.culture = Can_Industry2001[,12]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.finance.insur = Can_Industry2001[,13]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.realestate = Can_Industry2001[,14]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.sci.technical = Can_Industry2001[,15]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.company.management = Can_Industry2001[,16]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.admin.waste = Can_Industry2001[,17]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.education = Can_Industry2001[,18]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.health = Can_Industry2001[,19]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.arts.entertain = Can_Industry2001[,20]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.food.accommo = Can_Industry2001[,21]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.other = Can_Industry2001[,22]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, i.pub.admin = Can_Industry2001[,23]/Can_Industry2001[,1]*100)
Can_Industry2001 <- mutate(Can_Industry2001, Year = 2001)
Can_Industry2001 <- Can_Industry2001[,c(24,45,25:44)]
names(Can_Industry2001)[1] <- "City"
###CAN Industry 2011
Can_Industry2011 <- Can_Industry2011[,c(1, 2, 3, 17, 26, 31, 45, 153, 189, 229, 270, 289, 306, 318, 329, 332, 346, 355, 373, 386, 396, 415)]
Can_Industry2011 <- mutate(Can_Industry2011, i.agri = Can_Industry2011[,3]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.mining = Can_Industry2011[,4]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.utilities = Can_Industry2011[,5]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.construction = Can_Industry2011[,6]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.manufacturing = Can_Industry2011[,7]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.wholesale = Can_Industry2011[,8]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.retail = Can_Industry2011[,9]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.transportation = Can_Industry2011[,10]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.info.culture = Can_Industry2011[,11]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.finance.insur = Can_Industry2011[,12]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.realestate = Can_Industry2011[,13]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.sci.technical = Can_Industry2011[,14]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.company.management = Can_Industry2011[,15]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.admin.waste = Can_Industry2011[,16]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.education = Can_Industry2011[,17]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.health = Can_Industry2011[,18]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.arts.entertain = Can_Industry2011[,19]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.food.accommo = Can_Industry2011[,20]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.other = Can_Industry2011[,21]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, i.pub.admin = Can_Industry2011[,22]/Can_Industry2011[,2]*100)
Can_Industry2011 <- mutate(Can_Industry2011, Year = 2011)
Can_Industry2011 <- Can_Industry2011[,c(1,43,23:42)]
names(Can_Industry2011)[1] <- "City"
###Race USA 2000
Race2000 <- Race2000[,c(3, 6, 9, 11, 23, 83, 13, 53, 55)]
Race2000 <- Race2000[,-2]
Race2000$Year <- 2000
Race2000 <- Race2000[,c(1,9,2:8)]
names(Race2000) <- c("City", "Year", "race.white", "race.black", "race.asian", "race.latin", "race.indigenous", "race.other", "race.mixed")
###Race USA 2010
Race2010 <- Race2010[,c(3, 9, 11, 13, 57, 23, 27, 29)]
Race2010$Year <- 2010
Race2010 <- Race2010[,c(1,9,2,3,6,5,4,7,8)]
names(Race2010) <- c("City", "Year", "race.white", "race.black", "race.asian", "race.latin", "race.indigenous", "race.other", "race.mixed")
###Race USA 2015
Race2015 <- Race2015[,c(3, 130, 134, 158, 266, 138, 210, 214)]
Race2015$Year <- 2015
Race2015 <- Race2015[,c(1,9,2:8)]
names(Race2015) <- colnames(Race2000)
##Race CAN 2001
Can_Race2001 <- mutate(Can_Race2001, race.white = Can_Race2001[,4]/Can_Race2001[,2]*100)
Can_Race2001 <- mutate(Can_Race2001, race.black = Can_Race2001[,7]/Can_Race2001[,2]*100)
Can_Race2001 <- mutate(Can_Race2001, race.asian = (Can_Race2001[,5] + Can_Race2001[,6] + Can_Race2001[,8] + Can_Race2001[,10] + Can_Race2001[,13] + Can_Race2001[,14])/Can_Race2001[,2]*100)
Can_Race2001 <- mutate(Can_Race2001, race.latin = Can_Race2001[,9]/Can_Race2001[,2]*100)
Can_Race2001 <- mutate(Can_Race2001, race.indigenous = Can_Race2001[,29]/Can_Race2001[,2]*100)
Can_Race2001 <- mutate(Can_Race2001, race.other = (Can_Race2001[,11] + Can_Race2001[,12] + Can_Race2001[,15])/Can_Race2001[,2]*100)
Can_Race2001 <- mutate(Can_Race2001, race.mixed = Can_Race2001[,16]/Can_Race2001[,2]*100)
Can_Race2001$Year <- 2001
Can_Race2001 <- Can_Race2001[,c(1,37,30:36)]
names(Can_Race2001)[1] <- "City"
##Race CAN 2011
Can_Race2011 <- mutate(Can_Race2011, race.white = (Can_Race2011[,180] - Can_Race2011[,182])/Can_Race2011[,166]*100)
Can_Race2011 <- mutate(Can_Race2011, race.black = Can_Race2011[,170]/Can_Race2011[,166]*100)
Can_Race2011 <- mutate(Can_Race2011, race.asian = (Can_Race2011[,168] + Can_Race2011[,169] + Can_Race2011[,171] + Can_Race2011[,174] + Can_Race2011[,176] + Can_Race2011[,177])/Can_Race2011[,166]*100)
Can_Race2011 <- mutate(Can_Race2011, race.latin = Can_Race2011[,172]/Can_Race2011[,166]*100)
Can_Race2011 <- mutate(Can_Race2011, race.indigenous = Can_Race2011[,182]/Can_Race2011[,166]*100)
Can_Race2011 <- mutate(Can_Race2011, race.other = (Can_Race2011[,178] + Can_Race2011[,175] + Can_Race2011[,173])/Can_Race2011[,166]*100)
Can_Race2011 <- mutate(Can_Race2011, race.mixed = Can_Race2011[,179]/Can_Race2011[,166]*100)
Can_Race2011$Year <- 2011
Can_Race2011 <- Can_Race2011[,c(1,336,329:335)]
names(Can_Race2011)[1] <- "City"
####CAN Census 2016
Can_Census2016 <- mutate(Can_Census2016, age.Less5 = Can_Census2016[,10]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.5.9 = Can_Census2016[,11]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.10.14 = Can_Census2016[,12]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.15.19 = Can_Census2016[,13]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.20.24 = Can_Census2016[,14]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.25.29 = Can_Census2016[,15]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.30.34 = Can_Census2016[,16]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.35.39 = Can_Census2016[,17]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.40.44 = Can_Census2016[,18]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.45.49 = Can_Census2016[,19]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.50.54 = Can_Census2016[,20]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.55.59 = Can_Census2016[,21]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.60.64 = Can_Census2016[,22]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.65.69 = Can_Census2016[,23]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.70.74 = Can_Census2016[,24]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.75.79 = Can_Census2016[,25]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.80.84 = Can_Census2016[,26]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, age.Plus85 = Can_Census2016[,27]/Can_Census2016[,3]*100)
Can_Census2016 <- mutate(Can_Census2016, m.per.f.100 = (Can_Census2016[,4] / Can_Census2016[,5])*100)
Can_Census2016$Year <- 2016
Can_Census2016 <- Can_Census2016[,-c(3,4,5)]
### US Census 1990
Census_1990 <- mutate(Census_1990, age.Less5 = Census_1990[,8]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.5.9 = Census_1990[,9]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.10.14 = Census_1990[,10]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.15.19 = Census_1990[,11]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.20.24 = Census_1990[,12]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.25.29 = Census_1990[,13]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.30.34 = Census_1990[,14]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.35.39 = Census_1990[,15]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.40.44 = Census_1990[,16]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.45.49 = Census_1990[,17]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.50.54 = Census_1990[,18]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.55.59 = Census_1990[,19]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.60.64 = Census_1990[,20]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.65.69 = Census_1990[,21]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.70.74 = Census_1990[,22]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.75.79 = Census_1990[,23]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.80.84 = Census_1990[,24]/Census_1990[,7]*100)
Census_1990 <- mutate(Census_1990, age.Plus85 = Census_1990[,25]/Census_1990[,7]*100)
Census_1990 <- Census_1990[,-7]
Now that I have my data compiled into uniform sets, it is time for some cleaning. I will: - Make sure the city names match the names in the majorfour dataset for when I merge - Convert CAD to USD - Convert prices in previous years to one uniform price based on inflation
##City Name Change
city.frame <- as.data.frame(table(majorfour$City))
city.frame <- city.frame[1]
names(city.frame)[1] <- "City"
citylist2 <- as.character(city.frame$City)
citylist2 <- citylist2[-c(6, 15, 17, 27, 35, 40, 51,52,54)]
city.total <- data.frame(citylist2)
city.nonash <- data.frame(city.total[-24,])
Age2000 <- cbind(city.nonash, Age2000)
Age2000 <- Age2000[,-2]
names(Age2000)[1] <- "City"
Age2010 <- cbind(city.total, Age2010)
Age2010 <- Age2010[,-2]
names(Age2010)[1] <- "City"
Age2015 <- cbind(city.total, Age2015)
Age2015 <- Age2015[,-2]
names(Age2015)[1] <- "City"
Income2000 <- cbind(city.nonash, Income2000)
Income2000 <- Income2000[,-2]
names(Income2000)[1] <- "City"
Income2010 <- cbind(city.total, Income2010)
Income2010 <- Income2010[,-2]
names(Income2010)[1] <- "City"
Income2015 <- cbind(city.total, Income2015)
Income2015 <- Income2015[,-2]
names(Income2015)[1] <- "City"
Industry2015 <- cbind(city.total, Industry2015)
Industry2015 <- Industry2015[,-2]
names(Industry2015)[1] <- "City"
Race2000 <- cbind(city.nonash, Race2000)
Race2000 <- Race2000[,-2]
names(Race2000)[1] <- "City"
Race2010 <- cbind(city.total, Race2010)
Race2010 <- Race2010[,-2]
names(Race2010)[1] <- "City"
Race2015 <- cbind(city.total, Race2015)
Race2015 <- Race2015[,-2]
names(Race2015)[1] <- "City"
###Converting CAD to USD
conversion <- read.csv("Conversion_USCAD.csv", header = T)
Can_Income2001$Median.Income <- Can_Income2001$Median.Income *0.646223
Can_Income2011$Median.Income <- Can_Income2011$Median.Income *1.011464
Can_Census1996$Median.Income <- Can_Census1996$Median.Income *0.733274
Can_Census2016$Median.Income <- Can_Census2016$Median.Income* 0.782992
###Inflation conversion
Income2000 <- mutate(Income2000, adj.median.income = Income2000$Median.Income * 1.42)
Income2000 <- Income2000[,c(1:3, 14, 4:13)]
Income2010 <- mutate(Income2010, adj.median.income = Income2010$Median.Income *1.12)
Income2010 <- Income2010[,c(1:3, 14, 4:13)]
Income2015 <- mutate(Income2015, adj.median.income = Income2015$Median.Income *1.03)
Income2015 <- Income2015[,c(1:3, 14, 4:13)]
Census_1990 <- mutate(Census_1990, adj.median.income = Census_1990$Median.Income *1.87)
Census_1990 <- Census_1990[,c(1:25, 63, 26:62)]
Can_Income2001 <- mutate(Can_Income2001, adj.median.income = Can_Income2001$Median.Income * 1.38)
Can_Income2001 <- Can_Income2001[,c(1:3, 14, 4:13)]
Can_Income2011 <- mutate(Can_Income2011, adj.median.income = Can_Income2011$Median.Income * 1.09)
Can_Income2011 <- Can_Income2011[,c(1:3, 14, 4:13)]
Can_Census1996 <- mutate(Can_Census1996, adj.median.income = Can_Census1996$Median.Income * 1.56)
Can_Census1996 <- Can_Census1996[,c(1:25, 63, 26:62)]
Can_Census2016 <- mutate(Can_Census2016, adj.median.income = Can_Census2016$Median.Income * 1.03)
Can_Census2016 <- Can_Census2016[,c(1:25, 63, 26:62)]
###Inflation for the ticket Prices
inflation <- read.csv("inflation.csv", header = T)
majorfour <- merge(majorfour, inflation, by = "Year", all.x = T)
majorfour <- mutate(majorfour, adj.ticket.price = majorfour$Ave.Tick*majorfour$Conversion)
majorfour <- majorfour[,c(1:11,23,12:14,19:21)]
My next step is to bind each individual demographic dataset together for each Census. Creating a dataset that contains all the variables for each Census:
Can_Census2001 <- merge(Can_Age2001, Can_Income2001, by = c("City", "Year"))
Can_Census2001 <- merge(Can_Census2001, Can_Race2001, by = c("City", "Year"))
Can_Census2001 <- merge(Can_Census2001, Can_Industry2001, by = c("City", "Year"))
Can_Census2011 <- merge(Can_Age2011, Can_Income2011, by = c("City", "Year"))
Can_Census2011 <- merge(Can_Census2011, Can_Race2011, by = c("City", "Year"))
Can_Census2011 <- merge(Can_Census2011, Can_Industry2011, by = c("City", "Year"))
US_Census2000 <- merge(Age2000, Income2000, by = c("City", "Year"))
US_Census2000 <- merge(US_Census2000, Race2000, by = c("City", "Year"))
nashville <- c("Nashville", 2000, rep(NA, 41))
US_Census2000 <- rbind(US_Census2000, nashville)
US_Census2000 <- US_Census2000[c(1:23,45,24:44),]
US_Census2010 <- merge(Age2010, Income2010, by = c("City", "Year"))
US_Census2010 <- merge(US_Census2010, Race2010, by = c("City", "Year"))
US_Census2015 <- merge(Age2015, Income2015, by = c("City", "Year"))
US_Census2015 <- merge(US_Census2015, Race2015, by = c("City", "Year"))
US_Census2015 <- merge(US_Census2015, Industry2015, by = c("City", "Year"))
head(US_Census2015)
## City Year m.per.f.100 Median.Age Male.Median Female.Median
## 1 Anaheim 2015 99.6 33.6 32.3 34.8
## 2 Atlanta 2015 97.1 33.4 33.0 33.7
## 3 Baltimore 2015 89.1 34.6 33.3 35.9
## 4 Boston 2015 92.1 31.6 30.9 32.1
## 5 Buffalo 2015 91.3 33.1 31.5 34.6
## 6 Charlotte 2015 92.2 33.7 32.8 34.7
## age.Less5 age.5.9 age.10.14 age.15.19 age.20.24 age.25.29 age.30.34
## 1 7.2 7.0 7.0 7.2 7.8 8.2 7.6
## 2 6.1 5.1 5.0 6.4 10.4 10.6 9.4
## 3 6.7 5.9 5.4 6.2 8.4 9.8 8.3
## 4 5.4 4.3 4.2 7.4 11.9 13.3 9.5
## 5 6.6 6.1 6.2 7.4 9.7 9.2 7.3
## 6 7.2 7.1 6.5 6.4 7.3 8.8 8.6
## age.35.39 age.40.44 age.45.49 age.50.54 age.55.59 age.60.64 age.65.69
## 1 7.0 7.2 6.8 6.5 5.6 4.6 3.3
## 2 7.5 7.0 6.3 6.0 5.1 4.7 3.4
## 3 6.2 5.9 6.3 7.0 6.4 5.6 4.0
## 4 6.6 5.9 5.7 5.6 5.2 4.5 3.3
## 5 5.7 5.7 5.7 6.5 6.8 5.1 3.5
## 6 7.8 7.6 6.8 6.4 5.6 4.6 3.2
## age.70.74 age.75.79 age.80.84 age.Plus85 Median.Income adj.median.income
## 1 2.5 1.7 1.3 1.4 60752 62574.56
## 2 2.7 1.8 1.3 1.3 47527 48952.81
## 3 2.7 2.1 1.6 1.5 42241 43508.23
## 4 2.5 1.7 1.5 1.6 55777 57450.31
## 5 2.7 2.1 1.9 1.6 31918 32875.54
## 6 2.2 1.6 1.2 1.1 53637 55246.11
## inc.0.10 inc.10.15 inc.15.25 inc.25.35 inc.35.50 inc.50.75 inc.75.100
## 1 5.1 5.1 8.9 9.1 12.7 18.0 13.5
## 2 12.8 6.3 11.1 9.4 12.0 14.9 9.3
## 3 13.1 7.5 11.6 11.1 13.0 16.7 10.0
## 4 12.0 7.3 9.3 7.2 10.2 14.9 10.4
## 5 16.6 9.5 15.0 12.3 13.4 15.1 8.2
## 6 6.9 4.8 10.0 10.9 14.1 17.8 11.5
## inc.100.150 inc.150.200 inc.200.plus race.white race.black race.asian
## 1 15.5 6.8 5.2 68.2 2.3 16.0
## 2 10.5 4.9 8.8 40.0 52.9 3.9
## 3 9.9 3.7 3.4 30.3 62.8 2.6
## 4 14.1 6.5 8.1 53.0 25.2 9.3
## 5 6.7 1.7 1.4 48.5 37.3 4.4
## 6 12.4 4.9 6.7 51.6 34.9 5.8
## race.latin race.indigenous race.other race.mixed i.agri i.mining
## 1 53.1 0.3 9.6 3.2 0.37165859 0.048902446
## 2 5.0 0.2 1.0 1.9 0.14068773 0.017761242
## 3 4.6 0.3 1.7 2.3 0.13029196 0.023253239
## 4 18.8 0.4 7.5 4.5 0.10680266 0.003762695
## 5 10.8 0.4 5.5 3.8 0.06069635 0.083687395
## 6 13.5 0.3 4.6 2.7 0.18901332 0.034117029
## i.utilities i.construction i.manufacturing i.wholesale i.retail
## 1 0.3594330 7.421557 13.757481 3.953763 11.161983
## 2 0.4954452 2.938551 4.937625 2.657175 9.692964
## 3 0.6167645 4.863618 4.705274 1.924113 9.588824
## 4 0.4350255 3.338090 4.384987 1.448059 8.396600
## 5 0.3853299 3.317148 8.880060 2.026890 10.466442
## 6 0.8947627 6.245658 7.642214 3.202767 10.749852
## i.transportation i.info.culture i.finance.insur i.realestate
## 1 2.961043 1.844845 4.700136 2.121144
## 2 4.869852 3.832222 5.524681 2.785243
## 3 4.479017 1.940723 3.342930 2.161075
## 4 2.719850 2.426649 7.193405 1.964706
## 5 4.110798 1.711453 5.206092 1.663632
## 6 4.374700 2.802577 10.781977 2.273638
## i.sci.technical i.company.management i.admin.waste i.education i.health
## 1 5.808388 0.06418446 6.472239 6.299246 11.86985
## 2 14.939542 0.12432870 4.883874 11.983697 9.95097
## 3 7.135422 0.03654080 5.076588 12.747204 18.58524
## 4 11.675065 0.06657077 4.454163 13.356411 18.42274
## 5 5.236440 0.02850889 5.748680 10.785558 20.07486
## 6 8.374112 0.14045259 5.283657 7.170553 11.74348
## i.arts.entertain i.food.accommo i.other i.pub.admin
## 1 3.738592 9.136199 5.419614 2.489746
## 2 2.642218 8.821261 4.630075 4.131826
## 3 1.939246 6.962315 4.911601 8.829956
## 4 2.081060 8.549134 4.605829 4.371094
## 5 2.285310 9.826372 4.091486 4.010557
## 6 2.244751 8.873416 4.978347 1.999955
Census are not tallied every year. I perform some interpolation for the years in between to help create a complete data set. For the any years outside of my max or min census years, their missing values will unfortunately be kept as NA’s.
###interpolation of demographic information for Canada
all.can.census <- bind_rows(Can_Census1996, Can_Census2001, Can_Census2011, Can_Census2016)
remain.years <- data.frame(canlist, rep(c(1997:2000,2002:2010,2012:2015), each = 7))
names(remain.years) <- c("City", "Year")
all.can.census <- bind_rows(all.can.census, remain.years)
all.can.census <- all.can.census[order(all.can.census[,1], all.can.census[,2]), ]
all.can.census[,c(3,7:26)] <- lapply(all.can.census[,c(3,7:26)], na.approx)
can.setp.2 <- all.can.census[,c(1,2,4:6)]
can.setp.2 <- subset(can.setp.2, Year > 2000)
can.setp.2[,c(3:5)] <- na.approx(can.setp.2[,c(3:5)])
can.setp.3 <- all.can.census[,c(1,2,37:45,47:54,56,58,59,61:63)]
can.setp.3 <- subset(can.setp.3, Year < 2012)
can.setp.3[,c(3:25)] <- na.approx(can.setp.3[,c(3:25)])
can.setp.4 <- all.can.census[,c(1,2,46,55,57,60)]
can.setp.4 <- subset(can.setp.4, Year > 2000 & Year < 2012)
can.setp.4[,c(3:6)] <- na.approx(can.setp.4[,c(3:6)])
all.can.census <- all.can.census[,c(1,2,3,7:36 )]
all.can.census <- merge(all.can.census, can.setp.2, all.x = T, by = c("City", "Year"))
all.can.census <- merge(all.can.census, can.setp.3, all.x = T, by = c("City", "Year"))
all.can.census <- merge(all.can.census, can.setp.4, all.x = T, by = c("City", "Year"))
all.can.census <- all.can.census[,c(1:3,34:36,4:33,37:45,60,46:53,61,54,62,55,56,63,57:59)]
outside.census.years <- data.frame(canlist, rep(c(1991:1995), each = 7))
names(outside.census.years) <- c("City", "Year")
all.can.census <- bind_rows(all.can.census, outside.census.years)
all.can.census <- all.can.census[order(all.can.census[,1], all.can.census[,2]), ]
###interpolation of demographic information for USA
citylist <- as.character(city.frame$City)
citylist <- citylist[-c(6, 15,17, 27, 35, 40, 51,52,54)]
US_Census2000[,-1] <- lapply(US_Census2000[,-1], as.numeric)
US_Census2010[,-1] <- lapply(US_Census2010[,-1], as.numeric)
US_Census2015[,-1] <- lapply(US_Census2015[,-1], as.numeric)
all.us.census <- bind_rows(Census_1990,US_Census2000, US_Census2010, US_Census2015)
remain.years.us <- data.frame(citylist, rep(c(1991:1999, 2001:2009,2011:2014), each = 45))
names(remain.years.us) <- c("City", "Year")
remain.years.us$Year <- as.numeric(remain.years.us$Year)
all.us.census <- bind_rows(all.us.census, remain.years.us)
all.us.census <- all.us.census[order(all.us.census[,1], all.us.census[,2]), ]
all.us.census[,c(3:26,37:42)] <- lapply(all.us.census[,c(3:26,37:42)], na.approx)
us.setp.2 <- all.us.census[,c(1,2,27:36,43)]
us.setp.2 <- subset(us.setp.2, Year > 1999)
us.setp.2[,c(3:13)] <- na.approx(us.setp.2[,c(3:13)])
all.us.census <- all.us.census[,c(1,2,3:26,37:42,44:63)]
all.us.census <- merge(all.us.census, us.setp.2, all.x = T, by = c("City", "Year"))
all.us.census <- all.us.census[,c(1:26,53:62,27:32,63,33:52)]
outside.census.years.us <- data.frame(citylist, rep(2016, each = 45))
names(outside.census.years.us) <- c("City", "Year")
all.us.census <- bind_rows(all.us.census, outside.census.years.us)
all.us.census <- all.us.census[order(all.us.census[,1], all.us.census[,2]), ]
all.us.census <- subset(all.us.census, all.us.census$City != "Hartford")
###American Industry Calculation - Giving each industry the 2015 values because no other Industry information was found
use <- subset(all.us.census, all.us.census$Year == 2015)
use <- use[,c(1,2,44:63)]
use.duplicated <- expandRows(use, count = 27, count.is.col = F)
use.duplicated <- use.duplicated[,-c(1,2)]
all.us.census <- cbind(all.us.census, use.duplicated)
all.us.census <- all.us.census[,-c(44:63)]
Finally I can bind all the info together and merge the demographic variables to the majorfour dataset
demographics <- rbind(all.us.census, all.can.census)
majorfour.dem <- merge(majorfour, demographics, all.x = T, by = c("City", "Year"))
With the demographic variables attached to the data on each team by year, I can now start creating some new variables. These variables will help me calculate the correlation between year by year variance in winning with year by year variance in attendance. They will also help me determine if there are any significant variables that could predict what type of city would have these high or low correlations.
jump.frame <- majorfour.dem
###Calculate the percentage of capacity to population
jump.frame <- mutate(jump.frame, pct.capTOpop = jump.frame$Capacity/jump.frame$Pop *100)
###Calculate the zscore of the pct.capTOpop
jump.frame <- mutate(jump.frame, z.pct.cap.pop = scale(jump.frame$pct.capTOpop))
jump.frame$z.pct.cap.pop <- as.numeric(jump.frame$z.pct.cap.pop)
###Calculate the percentage of attendance to population
jump.frame <- mutate(jump.frame, pct.attTOpop = jump.frame$Ave.Attendance/jump.frame$Pop *100)
###Calculate the zscore of the pct.capTOpop
jump.frame <- mutate(jump.frame, z.pct.att.pop = scale(jump.frame$pct.attTOpop))
jump.frame$z.pct.att.pop <- as.numeric(jump.frame$z.pct.att.pop)
###Calculate the percentage of ticket price to income
jump.frame <- mutate(jump.frame, pct.tick.income = jump.frame$adj.ticket.price/jump.frame$adj.median.income*100)
###Calculate the percentage of payroll vs salary cap
jump.frame <- mutate(jump.frame, payroll.pct = jump.frame$Payroll/jump.frame$Salary.Cap*100)
###Calcule the salary cap pct by winning pct
jump.frame <- mutate(jump.frame, winning.per = jump.frame$Wpct/jump.frame$payroll.pct*100)
### Calculating the 1 year difference in ticket price
diff.func.tick <- function (t,s,y) {
y2 <- y-1
diff.att.2 <- jump.frame$adj.ticket.price[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y] - jump.frame$adj.ticket.price[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y2]
diff.att.2 <- diff.att.2/jump.frame$adj.ticket.price[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y2]*100
result <- diff.att.2
}
jump.frame$tickprice.diff.1 <- mapply(diff.func.tick, t = jump.frame$Team, s= jump.frame$Venue, y= jump.frame$Year)
jump.frame$tickprice.diff.1 <- as.numeric(jump.frame$tickprice.diff.1)
jump.frame$tickprice.diff.1 <- round(jump.frame$tickprice.diff.1, digits = 4)
### Calculating the 2 year difference in ticket price
diff.func.tick.2 <- function (t,s,y) {
y2 <- y-2
diff.att.2 <- jump.frame$adj.ticket.price[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y] - jump.frame$adj.ticket.price[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y2]
diff.att.2 <- diff.att.2/jump.frame$adj.ticket.price[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y2]*100
result <- diff.att.2
}
jump.frame$tickprice.diff.2 <- mapply(diff.func.tick.2, t = jump.frame$Team, s= jump.frame$Venue, y= jump.frame$Year)
jump.frame$tickprice.diff.2 <- as.numeric(jump.frame$tickprice.diff.2)
jump.frame$tickprice.diff.2 <- round(jump.frame$tickprice.diff.2, digits = 4)
### Calculating the 3 year difference in ticket price
diff.func.tick.3 <- function (t,s,y) {
y2 <- y-3
diff.att.2 <- jump.frame$adj.ticket.price[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y] - jump.frame$adj.ticket.price[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y2]
diff.att.2 <- diff.att.2/jump.frame$adj.ticket.price[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y2]*100
result <- diff.att.2
}
jump.frame$tickprice.diff.3 <- mapply(diff.func.tick.3, t = jump.frame$Team, s= jump.frame$Venue, y= jump.frame$Year)
jump.frame$tickprice.diff.3 <- as.numeric(jump.frame$tickprice.diff.3)
jump.frame$tickprice.diff.3 <- round(jump.frame$tickprice.diff.3, digits = 4)
### Calculating the 1 year difference in attendance
diff.func.2 <- function (t,s,y) {
y2 <- y-1
diff.att.2 <- jump.frame$att.pct[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y] - jump.frame$att.pct[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y2]
result <- diff.att.2
}
jump.frame$att.diff.control <- mapply(diff.func.2, t = jump.frame$Team, s= jump.frame$Venue, y= jump.frame$Year)
jump.frame$att.diff.control <- as.numeric(jump.frame$att.diff.control)
jump.frame$att.diff.control <- jump.frame$att.diff.control*100
jump.frame$att.diff.control <- round(jump.frame$att.diff.control, digits = 4)
### Calculating the 2 year difference in attendance
diff.func.3 <- function (t,s,y) {
y2 <- y-2
diff.att.3 <- jump.frame$att.pct[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y] - jump.frame$att.pct[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y2]
result <- diff.att.3
}
jump.frame$att.diff.control.2 <- mapply(diff.func.3, t = jump.frame$Team, s= jump.frame$Venue, y= jump.frame$Year)
jump.frame$att.diff.control.2 <- as.numeric(jump.frame$att.diff.control.2)
jump.frame$att.diff.control.2 <- jump.frame$att.diff.control.2*100
jump.frame$att.diff.control.2 <- round(jump.frame$att.diff.control.2, digits = 4)
### Calculating the 3 year difference in attendance
diff.func.4 <- function (t,s,y) {
y2 <- y-3
diff.att.3 <- jump.frame$att.pct[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y] - jump.frame$att.pct[jump.frame$Team == t & jump.frame$Venue == s & jump.frame$Year == y2]
result <- diff.att.3
}
jump.frame$att.diff.control.3 <- mapply(diff.func.4, t = jump.frame$Team, s= jump.frame$Venue, y= jump.frame$Year)
jump.frame$att.diff.control.3 <- as.numeric(jump.frame$att.diff.control.3)
jump.frame$att.diff.control.3 <- jump.frame$att.diff.control.3*100
jump.frame$att.diff.control.3 <- round(jump.frame$att.diff.control.3, digits = 4)
### Calculating the 1 year difference in winning percentage
diff.func.win <- function (t,y) {
y2 <- y-1
diff.win <- jump.frame$Wpct[jump.frame$Team == t & jump.frame$Year == y] - jump.frame$Wpct[jump.frame$Team == t & jump.frame$Year == y2]
result <- diff.win
}
jump.frame$win.diff <- mapply(diff.func.win, t = jump.frame$Team, y= jump.frame$Year)
jump.frame$win.diff <- as.numeric(jump.frame$win.diff)
jump.frame$win.diff <- jump.frame$win.diff*100
### Calculating the 2 year difference in winning percentage
diff.func.win.2 <- function (t,y) {
y2 <- y-2
diff.win <- jump.frame$Wpct[jump.frame$Team == t & jump.frame$Year == y] - jump.frame$Wpct[jump.frame$Team == t & jump.frame$Year == y2]
result <- diff.win
}
jump.frame$win.diff.2 <- mapply(diff.func.win.2, t = jump.frame$Team, y= jump.frame$Year)
jump.frame$win.diff.2 <- as.numeric(jump.frame$win.diff.2)
jump.frame$win.diff.2 <- jump.frame$win.diff.2*100
### Calculating the 3 year difference in winning percentage
diff.func.win.3 <- function (t,y) {
y2 <- y-3
diff.win <- jump.frame$Wpct[jump.frame$Team == t & jump.frame$Year == y] - jump.frame$Wpct[jump.frame$Team == t & jump.frame$Year == y2]
result <- diff.win
}
jump.frame$win.diff.3 <- mapply(diff.func.win.3, t = jump.frame$Team, y= jump.frame$Year)
jump.frame$win.diff.3 <- as.numeric(jump.frame$win.diff.3)
jump.frame$win.diff.3 <- jump.frame$win.diff.3*100
I need to standardize the winning pct jumps between each league. It is much easier for a football team to have a 70% increase or decrease in winning percentage year over year because they only play 16 games a year. Compare that to an MLB team who plays 162 - you would see a lot less variance in winning percentage in that league.
mlb.mean.win <- mean(jump.frame$win.diff[jump.frame$League == "MLB"], na.rm = T)
nba.mean.win <- mean(jump.frame$win.diff[jump.frame$League == "NBA"], na.rm = T)
nhl.mean.win <- mean(jump.frame$win.diff[jump.frame$League == "NHL"], na.rm = T)
nfl.mean.win <- mean(jump.frame$win.diff[jump.frame$League == "NFL"], na.rm = T)
mlb.sd.win <- sd(jump.frame$win.diff[jump.frame$League == "MLB"], na.rm = T)
nba.sd.win <- sd(jump.frame$win.diff[jump.frame$League == "NBA"], na.rm = T)
nhl.sd.win <- sd(jump.frame$win.diff[jump.frame$League == "NHL"], na.rm = T)
nfl.sd.win <- sd(jump.frame$win.diff[jump.frame$League == "NFL"], na.rm = T)
jump.frame$mean.win <- ifelse(jump.frame$League == "MLB", mlb.mean.win,
ifelse(jump.frame$League == "NBA", nba.mean.win,
ifelse(jump.frame$League == "NHL", nhl.mean.win,
nfl.mean.win)))
jump.frame$sd.win <- ifelse(jump.frame$League == "MLB", mlb.sd.win,
ifelse(jump.frame$League == "NBA", nba.sd.win,
ifelse(jump.frame$League == "NHL", nhl.sd.win,
nfl.sd.win)))
jump.frame <- mutate(jump.frame, z.win.diff = (win.diff-mean.win)/sd.win)
jump.frame <- subset(jump.frame, select = -c(mean.win, sd.win))
###Standardizing the percentage jump in winning for the 2 year span
mlb.mean.win.2 <- mean(jump.frame$win.diff.2[jump.frame$League == "MLB"], na.rm = T)
nba.mean.win.2 <- mean(jump.frame$win.diff.2[jump.frame$League == "NBA"], na.rm = T)
nhl.mean.win.2 <- mean(jump.frame$win.diff.2[jump.frame$League == "NHL"], na.rm = T)
nfl.mean.win.2 <- mean(jump.frame$win.diff.2[jump.frame$League == "NFL"], na.rm = T)
mlb.sd.win.2 <- sd(jump.frame$win.diff.2[jump.frame$League == "MLB"], na.rm = T)
nba.sd.win.2 <- sd(jump.frame$win.diff.2[jump.frame$League == "NBA"], na.rm = T)
nhl.sd.win.2 <- sd(jump.frame$win.diff.2[jump.frame$League == "NHL"], na.rm = T)
nfl.sd.win.2 <- sd(jump.frame$win.diff.2[jump.frame$League == "NFL"], na.rm = T)
jump.frame$mean.win.2 <- ifelse(jump.frame$League == "MLB", mlb.mean.win.2,
ifelse(jump.frame$League == "NBA", nba.mean.win.2,
ifelse(jump.frame$League == "NHL", nhl.mean.win.2,
nfl.mean.win.2)))
jump.frame$sd.win.2 <- ifelse(jump.frame$League == "MLB", mlb.sd.win.2,
ifelse(jump.frame$League == "NBA", nba.sd.win.2,
ifelse(jump.frame$League == "NHL", nhl.sd.win.2,
nfl.sd.win.2)))
jump.frame <- mutate(jump.frame, z.win.diff.2 = (win.diff.2-mean.win.2)/sd.win.2)
jump.frame <- subset(jump.frame, select = -c(mean.win.2, sd.win.2))
###Standardizing the percentage jump in winning for the 3 year span
mlb.mean.win.3 <- mean(jump.frame$win.diff.3[jump.frame$League == "MLB"], na.rm = T)
nba.mean.win.3 <- mean(jump.frame$win.diff.3[jump.frame$League == "NBA"], na.rm = T)
nhl.mean.win.3 <- mean(jump.frame$win.diff.3[jump.frame$League == "NHL"], na.rm = T)
nfl.mean.win.3 <- mean(jump.frame$win.diff.3[jump.frame$League == "NFL"], na.rm = T)
mlb.sd.win.3 <- sd(jump.frame$win.diff.3[jump.frame$League == "MLB"], na.rm = T)
nba.sd.win.3 <- sd(jump.frame$win.diff.3[jump.frame$League == "NBA"], na.rm = T)
nhl.sd.win.3 <- sd(jump.frame$win.diff.3[jump.frame$League == "NHL"], na.rm = T)
nfl.sd.win.3 <- sd(jump.frame$win.diff.3[jump.frame$League == "NFL"], na.rm = T)
jump.frame$mean.win.3 <- ifelse(jump.frame$League == "MLB", mlb.mean.win.3,
ifelse(jump.frame$League == "NBA", nba.mean.win.3,
ifelse(jump.frame$League == "NHL", nhl.mean.win.3,
nfl.mean.win.3)))
jump.frame$sd.win.3 <- ifelse(jump.frame$League == "MLB", mlb.sd.win.3,
ifelse(jump.frame$League == "NBA", nba.sd.win.3,
ifelse(jump.frame$League == "NHL", nhl.sd.win.3,
nfl.sd.win.3)))
jump.frame <- mutate(jump.frame, z.win.diff.3 = (win.diff.3-mean.win.3)/sd.win.3)
jump.frame <- subset(jump.frame, select = -c(mean.win.3, sd.win.3))
Is this standardization necessary for attendance. If we look at the spread of attendance, I’d say it is quite uniform between the 4 leagues:
plot_ly(jump.frame, x = jump.frame$Team, y = jump.frame$att.diff.control , color = jump.frame$League,
type = 'scatter', mode = 'marker',
hoverinfo = 'text', text = jump.frame$Team)
## A marker object has been specified, but markers is not in the mode
## Adding markers to the mode...
## A marker object has been specified, but markers is not in the mode
## Adding markers to the mode...
## A marker object has been specified, but markers is not in the mode
## Adding markers to the mode...
## A marker object has been specified, but markers is not in the mode
## Adding markers to the mode...
We can now calculate correlation. Is the dataset parametric? The variables look normal however we have too many missing variables. I have decided to stick with the Spearman correlation.
#Test for Normality
hist(jump.frame$z.win.diff)
hist(jump.frame$att.diff.control)
shapiro.test(jump.frame$z.win.diff)
##
## Shapiro-Wilk normality test
##
## data: jump.frame$z.win.diff
## W = 0.99903, p-value = 0.1137
shapiro.test(jump.frame$att.diff.control)
##
## Shapiro-Wilk normality test
##
## data: jump.frame$att.diff.control
## W = 0.9267, p-value < 2.2e-16
qqnorm(jump.frame$z.win.diff)
qqnorm(jump.frame$att.diff.control)
norm.test <- subset(jump.frame, select = c("z.win.diff", "att.diff.control"))
sum(complete.cases(norm.test))/3067*100
## [1] 89.37072
I have decided to only use what I consider ‘major jumps’ in winning percentage to calculate the correlation. This will highlight a fanbase’s reliance on success or failure to dictate their attendance behaviours. My cut off is anything inbetween the zscore of -0.5
hist(jump.frame$z.win.diff)
jump.frame.big <- subset(jump.frame, z.win.diff < -.5 | z.win.diff.2 > .5)
nrow(jump.frame.big)/3067*100
## [1] 53.66808
Check for outliers - it looks like I found a mistake (I have corected this mistake earlier in the script). Also remove any teams where they have 6 or less records (not enough to prove the correlation has actual strength).
boxplot(jump.frame.big$z.win.diff)
boxplot(jump.frame.big$att.diff.control)
count.jump.frame.big <- count(jump.frame.big, 'Team')
big.list <- subset(count.jump.frame.big, freq <7, select = Team)
big.list <- big.list$Team
team.df.2 <- as.data.frame(table(jump.frame.big$Team))
team.df.2 <- team.df.2[1]
names(team.df.2)[1] <- "Team"
leaguechecking <- subset(majorfour, select = c('Team', 'League', 'City'))
leaguechecking <- unique(leaguechecking[c("Team", "League", "City")])
team.df.2 <- merge(team.df.2, leaguechecking, all.x = T)
team.df.2 <- subset(team.df.2, !(team.df.2$Team %in% big.list))
Calculate the correlation between 1 year percent difference in winning percentage and 1 year percent difference in attendance. I have also decided to convert any negative correlations to zero. For the way I am looking at this correlation (trying to find fairweather fans), a negative correlation shows the fanbase has no reliance on winning to dictate their attendance. You could argue that the fans show up more when the team loses but I’d say that is not the case.
cor.1by1.big <- function (x) {
cor(jump.frame.big$z.win.diff[jump.frame.big$Team == x], jump.frame.big$att.diff.control[jump.frame.big$Team == x], use = "na.or.complete", method = "spearman")
}
team.df.2$cor.1by1.big <- lapply(team.df.2$Team, cor.1by1.big)
team.df.2$cor.1by1.big <- as.numeric(team.df.2$cor.1by1.big)
Cor.Team <- team.df.2[,c(1,4)]
Cor.Team <- Cor.Team[order(-Cor.Team$cor.1by1.big),]
Cor.Team[1:25,]
## Team cor.1by1.big
## 37 Denver Broncos 0.8225070
## 110 San Francisco Giants 0.8214286
## 25 Chicago White Sox 0.8181818
## 23 Chicago Bulls 0.8153846
## 75 Montreal Expos 0.8095238
## 9 Baltimore Ravens 0.7937200
## 6 Atlanta Hawks 0.7714286
## 40 Detroit Pistons 0.7623346
## 97 Philadelphia Phillies 0.7524757
## 112 Seattle Mariners 0.7472527
## 125 Toronto Raptors 0.7417582
## 68 Milwaukee Brewers 0.7363636
## 105 Sacramento Kings 0.7302967
## 14 Buffalo Bills 0.7174763
## 102 Pittsburgh Steelers 0.7173536
## 71 Minnesota Twins 0.7132867
## 3 Arizona Diamondbacks 0.7062937
## 131 Washington Redskins 0.7062711
## 132 Washington Wizards 0.6947368
## 126 Utah Jazz 0.6923077
## 101 Pittsburgh Pirates 0.6879289
## 29 Cleveland Cavaliers 0.6490653
## 60 Los Angeles Kings 0.6484848
## 42 Detroit Tigers 0.6484255
## 48 Houston Astros 0.6483516
###Converting negative to zero
jump.with.neg <- team.df.2
team.df.2[team.df.2$cor.1by1.big < 0,4] <- 0
demo.jump.frame.big <- aggregate(jump.frame.big[c(12,17,19:87)], by = list(jump.frame.big$Team), mean, na.rm = T)
team.df.2 <- merge(team.df.2, demo.jump.frame.big, by.x = "Team", by.y = "Group.1", all.x = T)
Now that I have found which fanbases show the highest correlation between winning and attendance, I will now see if there are any demographic variables that show significance when predicting a highly correlated fanbase.
To begin, I will remove any NA records as well as select the predictor variables that I believe have an actual chance at predicting my outcome:
team.df.1 <- team.df.2[complete.cases(team.df.2),]
set.seed(100)
coreset <- team.df.1[,c(4:10,30,41:75)]
Split my data into training and test:
train_index <- createDataPartition(coreset$cor.1by1.big, times = 1, p = 0.7, list = F)
trainset <- coreset[train_index,]
testset <- coreset[-train_index,]
train_cor <- trainset$cor.1by1.big
test_cor <- testset$cor.1by1.big
testset<-testset[,-1]
Check for highly correlated variables and remove those that are highly correlated and may come from similar sources (Median Age instead of Male or Female median age, zscore of stadium capacity to population instead of percentage of stadium capacity to population OR percentage of attendance to population OR z score of attednance to population, payroll instead of winning percentage/payroll):
cor.test <- cor(trainset, use = "na.or.complete")
cor.test.melt <- arrange(melt(cor.test), -abs(value))
trainset.stripped <- trainset[,-c(6,7,36:38,42)]
Next I want to calculate the predictive power of each individual variable. I tried to create a function to perform this but ran into an unstoppable error so decided to do it manually.
every.lm.df <- data.frame(colnames(trainset.stripped[,-1]))
every.lm.df$pvalue <- NA
every.lm.df$rsquared <- NA
names(every.lm.df)[1] <- "variable"
###tried to build a function but did not work
fit1 <- lm(cor.1by1.big~adj.ticket.price, data = trainset.stripped)
fit2 <- lm(cor.1by1.big~Pop, data = trainset.stripped)
fit3 <- lm(cor.1by1.big~m.per.f.100, data = trainset.stripped)
fit4 <- lm(cor.1by1.big~Median.Age, data = trainset.stripped)
fit5 <- lm(cor.1by1.big~adj.median.income, data = trainset.stripped)
fit6 <- lm(cor.1by1.big~race.white, data = trainset.stripped)
fit7 <- lm(cor.1by1.big~race.black, data = trainset.stripped)
fit8 <- lm(cor.1by1.big~race.asian, data = trainset.stripped)
fit9 <- lm(cor.1by1.big~race.latin, data = trainset.stripped)
fit10 <- lm(cor.1by1.big~race.indigenous, data = trainset.stripped)
fit11 <- lm(cor.1by1.big~race.other, data = trainset.stripped)
fit12 <- lm(cor.1by1.big~race.mixed, data = trainset.stripped)
fit13 <- lm(cor.1by1.big~i.agri, data = trainset.stripped)
fit14 <- lm(cor.1by1.big~i.mining, data = trainset.stripped)
fit15 <- lm(cor.1by1.big~i.utilities, data = trainset.stripped)
fit16 <- lm(cor.1by1.big~i.construction, data = trainset.stripped)
fit17 <- lm(cor.1by1.big~i.manufacturing, data = trainset.stripped)
fit18 <- lm(cor.1by1.big~i.wholesale, data = trainset.stripped)
fit19 <- lm(cor.1by1.big~i.retail, data = trainset.stripped)
fit20 <- lm(cor.1by1.big~i.transportation, data = trainset.stripped)
fit21 <- lm(cor.1by1.big~i.info.culture, data = trainset.stripped)
fit22 <- lm(cor.1by1.big~i.finance.insur, data = trainset.stripped)
fit23 <- lm(cor.1by1.big~i.realestate, data = trainset.stripped)
fit24 <- lm(cor.1by1.big~i.sci.technical, data = trainset.stripped)
fit25 <- lm(cor.1by1.big~i.company.management, data = trainset.stripped)
fit26 <- lm(cor.1by1.big~i.admin.waste, data = trainset.stripped)
fit27 <- lm(cor.1by1.big~i.education, data = trainset.stripped)
fit28 <- lm(cor.1by1.big~i.health, data = trainset.stripped)
fit29 <- lm(cor.1by1.big~i.arts.entertain, data = trainset.stripped)
fit30 <- lm(cor.1by1.big~i.food.accommo, data = trainset.stripped)
fit31 <- lm(cor.1by1.big~i.other, data = trainset.stripped)
fit32 <- lm(cor.1by1.big~i.pub.admin, data = trainset.stripped)
fit33 <- lm(cor.1by1.big~z.pct.att.pop, data = trainset.stripped)
fit34 <- lm(cor.1by1.big~pct.tick.income, data = trainset.stripped)
fit35 <- lm(cor.1by1.big~payroll.pct, data = trainset.stripped)
fit36 <- lm(cor.1by1.big~tickprice.diff.1, data = trainset.stripped)
every.lm.df[1,2] <- anova(fit1)$'Pr(>F)'[1]
every.lm.df[2,2] <- anova(fit2)$'Pr(>F)'[1]
every.lm.df[3,2] <- anova(fit3)$'Pr(>F)'[1]
every.lm.df[4,2] <- anova(fit4)$'Pr(>F)'[1]
every.lm.df[5,2] <- anova(fit5)$'Pr(>F)'[1]
every.lm.df[6,2] <- anova(fit6)$'Pr(>F)'[1]
every.lm.df[7,2] <- anova(fit7)$'Pr(>F)'[1]
every.lm.df[8,2] <- anova(fit8)$'Pr(>F)'[1]
every.lm.df[9,2] <- anova(fit9)$'Pr(>F)'[1]
every.lm.df[10,2] <- anova(fit10)$'Pr(>F)'[1]
every.lm.df[11,2] <- anova(fit11)$'Pr(>F)'[1]
every.lm.df[12,2] <- anova(fit12)$'Pr(>F)'[1]
every.lm.df[13,2] <- anova(fit13)$'Pr(>F)'[1]
every.lm.df[14,2] <- anova(fit14)$'Pr(>F)'[1]
every.lm.df[15,2] <- anova(fit15)$'Pr(>F)'[1]
every.lm.df[16,2] <- anova(fit16)$'Pr(>F)'[1]
every.lm.df[17,2] <- anova(fit17)$'Pr(>F)'[1]
every.lm.df[18,2] <- anova(fit18)$'Pr(>F)'[1]
every.lm.df[19,2] <- anova(fit19)$'Pr(>F)'[1]
every.lm.df[20,2] <- anova(fit20)$'Pr(>F)'[1]
every.lm.df[21,2] <- anova(fit21)$'Pr(>F)'[1]
every.lm.df[22,2] <- anova(fit22)$'Pr(>F)'[1]
every.lm.df[23,2] <- anova(fit23)$'Pr(>F)'[1]
every.lm.df[24,2] <- anova(fit24)$'Pr(>F)'[1]
every.lm.df[25,2] <- anova(fit25)$'Pr(>F)'[1]
every.lm.df[26,2] <- anova(fit26)$'Pr(>F)'[1]
every.lm.df[27,2] <- anova(fit27)$'Pr(>F)'[1]
every.lm.df[28,2] <- anova(fit28)$'Pr(>F)'[1]
every.lm.df[29,2] <- anova(fit29)$'Pr(>F)'[1]
every.lm.df[30,2] <- anova(fit30)$'Pr(>F)'[1]
every.lm.df[31,2] <- anova(fit31)$'Pr(>F)'[1]
every.lm.df[32,2] <- anova(fit32)$'Pr(>F)'[1]
every.lm.df[33,2] <- anova(fit33)$'Pr(>F)'[1]
every.lm.df[34,2] <- anova(fit34)$'Pr(>F)'[1]
every.lm.df[35,2] <- anova(fit35)$'Pr(>F)'[1]
every.lm.df[36,2] <- anova(fit36)$'Pr(>F)'[1]
every.lm.df[1,3] <- summary(fit1)$r.squared
every.lm.df[2,3] <- summary(fit2)$r.squared
every.lm.df[3,3] <- summary(fit3)$r.squared
every.lm.df[4,3] <- summary(fit4)$r.squared
every.lm.df[5,3] <- summary(fit5)$r.squared
every.lm.df[6,3] <- summary(fit6)$r.squared
every.lm.df[7,3] <- summary(fit7)$r.squared
every.lm.df[8,3] <- summary(fit8)$r.squared
every.lm.df[9,3] <- summary(fit9)$r.squared
every.lm.df[10,3] <- summary(fit10)$r.squared
every.lm.df[11,3] <- summary(fit11)$r.squared
every.lm.df[12,3] <- summary(fit12)$r.squared
every.lm.df[13,3] <- summary(fit13)$r.squared
every.lm.df[14,3] <- summary(fit14)$r.squared
every.lm.df[15,3] <- summary(fit15)$r.squared
every.lm.df[16,3] <- summary(fit16)$r.squared
every.lm.df[17,3] <- summary(fit17)$r.squared
every.lm.df[18,3] <- summary(fit18)$r.squared
every.lm.df[19,3] <- summary(fit19)$r.squared
every.lm.df[20,3] <- summary(fit20)$r.squared
every.lm.df[21,3] <- summary(fit21)$r.squared
every.lm.df[22,3] <- summary(fit22)$r.squared
every.lm.df[23,3] <- summary(fit23)$r.squared
every.lm.df[24,3] <- summary(fit24)$r.squared
every.lm.df[25,3] <- summary(fit25)$r.squared
every.lm.df[26,3] <- summary(fit26)$r.squared
every.lm.df[27,3] <- summary(fit27)$r.squared
every.lm.df[28,3] <- summary(fit28)$r.squared
every.lm.df[29,3] <- summary(fit29)$r.squared
every.lm.df[30,3] <- summary(fit30)$r.squared
every.lm.df[31,3] <- summary(fit31)$r.squared
every.lm.df[32,3] <-summary(fit32)$r.squared
every.lm.df[33,3] <- summary(fit33)$r.squared
every.lm.df[34,3] <- summary(fit34)$r.squared
every.lm.df[35,3] <- summary(fit35)$r.squared
every.lm.df[36,3] <- summary(fit36)$r.squared
every.lm.df
## variable pvalue rsquared
## 1 adj.ticket.price 0.021415679 6.857330e-02
## 2 Pop 0.997585660 1.229001e-07
## 3 m.per.f.100 0.308788477 1.380741e-02
## 4 Median.Age 0.248751455 1.769735e-02
## 5 adj.median.income 0.594349295 3.799423e-03
## 6 race.white 0.034999801 5.791858e-02
## 7 race.black 0.079271655 4.049459e-02
## 8 race.asian 0.899033501 2.160675e-04
## 9 race.latin 0.046091955 5.199573e-02
## 10 race.indigenous 0.272217101 1.605103e-02
## 11 race.other 0.932261346 9.697310e-05
## 12 race.mixed 0.547682974 4.840266e-03
## 13 i.agri 0.920576719 1.334316e-04
## 14 i.mining 0.431710654 8.263655e-03
## 15 i.utilities 0.009622175 8.604490e-02
## 16 i.construction 0.122715154 3.147033e-02
## 17 i.manufacturing 0.150681855 2.734084e-02
## 18 i.wholesale 0.392560711 9.763767e-03
## 19 i.retail 0.296353347 1.453096e-02
## 20 i.transportation 0.992649420 1.139224e-06
## 21 i.info.culture 0.797652386 8.819269e-04
## 22 i.finance.insur 0.381409197 1.023097e-02
## 23 i.realestate 0.200625595 2.174345e-02
## 24 i.sci.technical 0.631987973 3.074044e-03
## 25 i.company.management 0.963841819 2.758408e-05
## 26 i.admin.waste 0.638914245 2.950801e-03
## 27 i.education 0.874746587 3.334660e-04
## 28 i.health 0.291008542 1.485388e-02
## 29 i.arts.entertain 0.474049379 6.855693e-03
## 30 i.food.accommo 0.319584185 1.320887e-02
## 31 i.other 0.201109952 2.169724e-02
## 32 i.pub.admin 0.627171216 3.161577e-03
## 33 z.pct.att.pop 0.352958892 1.151395e-02
## 34 pct.tick.income 0.048927385 5.071755e-02
## 35 payroll.pct 0.953123133 4.638324e-05
## 36 tickprice.diff.1 0.729162430 1.607824e-03
As you see, none of the variables show any predictive power by themselves. Let’s see if we can fit a working model based on multiple variables:
full <- lm(cor.1by1.big~., data = trainset.stripped)
step <- stepAIC(full, direction = "both", trace = F)
step$anova
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 + Median.Age +
## adj.median.income + race.white + race.black + race.asian +
## race.latin + race.indigenous + race.other + race.mixed +
## i.agri + i.mining + i.utilities + i.construction + i.manufacturing +
## i.wholesale + i.retail + i.transportation + i.info.culture +
## i.finance.insur + i.realestate + i.sci.technical + i.company.management +
## i.admin.waste + i.education + i.health + i.arts.entertain +
## i.food.accommo + i.other + i.pub.admin + z.pct.att.pop +
## pct.tick.income + payroll.pct + tickprice.diff.1
##
## Final Model:
## cor.1by1.big ~ adj.ticket.price + adj.median.income + race.white +
## race.latin + race.indigenous + race.other + race.mixed +
## i.agri + i.utilities + i.construction + i.wholesale + i.transportation +
## i.realestate + i.company.management + i.health + i.arts.entertain +
## z.pct.att.pop + payroll.pct + Pop
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 41 1.797100 -217.3376
## 2 - i.pub.admin 0 0.000000e+00 41 1.797100 -217.3376
## 3 - i.food.accommo 1 6.840518e-05 42 1.797169 -219.3347
## 4 - i.retail 1 6.662072e-05 43 1.797235 -221.3318
## 5 - race.asian 1 1.558693e-03 44 1.798794 -223.2651
## 6 - i.sci.technical 1 2.184344e-03 45 1.800978 -225.1716
## 7 - i.other 1 5.315784e-03 46 1.806294 -226.9447
## 8 - m.per.f.100 1 6.320710e-03 47 1.812615 -228.6757
## 9 - pct.tick.income 1 5.580834e-03 48 1.818196 -230.4390
## 10 - i.info.culture 1 5.490905e-03 49 1.823686 -232.2068
## 11 - i.finance.insur 1 1.152619e-02 50 1.835213 -233.7217
## 12 - i.manufacturing 1 3.501397e-03 51 1.838714 -235.5749
## 13 - tickprice.diff.1 1 1.614673e-02 52 1.854861 -236.9017
## 14 - i.education 1 2.433345e-02 53 1.879194 -237.8981
## 15 - race.black 1 1.601560e-02 54 1.895210 -239.2446
## 16 - i.mining 1 3.366936e-02 55 1.928879 -239.8887
## 17 - Pop 1 2.477971e-02 56 1.953659 -240.9058
## 18 - Median.Age 1 3.330656e-02 57 1.986965 -241.6042
## 19 - i.admin.waste 1 3.841679e-02 58 2.025382 -242.1296
## 20 + Pop 1 5.750113e-02 57 1.967881 -242.3473
summary(step)
##
## Call:
## lm(formula = cor.1by1.big ~ adj.ticket.price + adj.median.income +
## race.white + race.latin + race.indigenous + race.other +
## race.mixed + i.agri + i.utilities + i.construction + i.wholesale +
## i.transportation + i.realestate + i.company.management +
## i.health + i.arts.entertain + z.pct.att.pop + payroll.pct +
## Pop, data = trainset.stripped)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.30511 -0.10187 0.00887 0.13254 0.34517
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.496e-01 6.499e-01 0.999 0.321782
## adj.ticket.price -6.597e-03 1.676e-03 -3.935 0.000228 ***
## adj.median.income -1.449e-05 4.785e-06 -3.029 0.003687 **
## race.white 7.505e-03 3.171e-03 2.366 0.021384 *
## race.latin -9.381e-03 3.168e-03 -2.961 0.004469 **
## race.indigenous -3.703e-01 8.402e-02 -4.407 4.69e-05 ***
## race.other 5.126e-02 9.362e-03 5.476 1.02e-06 ***
## race.mixed 5.327e-02 4.076e-02 1.307 0.196486
## i.agri 5.018e-01 2.894e-01 1.734 0.088299 .
## i.utilities 7.344e-01 1.679e-01 4.374 5.25e-05 ***
## i.construction -9.079e-02 2.483e-02 -3.657 0.000559 ***
## i.wholesale -1.758e-01 7.990e-02 -2.200 0.031873 *
## i.transportation 6.390e-02 2.804e-02 2.279 0.026421 *
## i.realestate 2.569e-01 9.767e-02 2.630 0.010952 *
## i.company.management 1.661e+00 6.002e-01 2.767 0.007617 **
## i.health -2.924e-02 1.804e-02 -1.621 0.110578
## i.arts.entertain -1.105e-01 4.492e-02 -2.460 0.016929 *
## z.pct.att.pop 5.615e-02 3.635e-02 1.545 0.127907
## payroll.pct 4.149e-03 1.222e-03 3.396 0.001252 **
## Pop -2.802e-08 2.171e-08 -1.291 0.202069
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1858 on 57 degrees of freedom
## Multiple R-squared: 0.5572, Adjusted R-squared: 0.4096
## F-statistic: 3.775 on 19 and 57 DF, p-value: 5.178e-05
step.for <- stepAIC(full, direction = "forward", trace = F)
step.for$anova
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 + Median.Age +
## adj.median.income + race.white + race.black + race.asian +
## race.latin + race.indigenous + race.other + race.mixed +
## i.agri + i.mining + i.utilities + i.construction + i.manufacturing +
## i.wholesale + i.retail + i.transportation + i.info.culture +
## i.finance.insur + i.realestate + i.sci.technical + i.company.management +
## i.admin.waste + i.education + i.health + i.arts.entertain +
## i.food.accommo + i.other + i.pub.admin + z.pct.att.pop +
## pct.tick.income + payroll.pct + tickprice.diff.1
##
## Final Model:
## cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 + Median.Age +
## adj.median.income + race.white + race.black + race.asian +
## race.latin + race.indigenous + race.other + race.mixed +
## i.agri + i.mining + i.utilities + i.construction + i.manufacturing +
## i.wholesale + i.retail + i.transportation + i.info.culture +
## i.finance.insur + i.realestate + i.sci.technical + i.company.management +
## i.admin.waste + i.education + i.health + i.arts.entertain +
## i.food.accommo + i.other + i.pub.admin + z.pct.att.pop +
## pct.tick.income + payroll.pct + tickprice.diff.1
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 41 1.7971 -217.3376
summary(step.for)
##
## Call:
## lm(formula = cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 +
## Median.Age + adj.median.income + race.white + race.black +
## race.asian + race.latin + race.indigenous + race.other +
## race.mixed + i.agri + i.mining + i.utilities + i.construction +
## i.manufacturing + i.wholesale + i.retail + i.transportation +
## i.info.culture + i.finance.insur + i.realestate + i.sci.technical +
## i.company.management + i.admin.waste + i.education + i.health +
## i.arts.entertain + i.food.accommo + i.other + i.pub.admin +
## z.pct.att.pop + pct.tick.income + payroll.pct + tickprice.diff.1,
## data = trainset.stripped)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.38061 -0.12110 -0.00275 0.12472 0.32482
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.452e+00 1.290e+01 -0.345 0.7318
## adj.ticket.price -8.559e-03 6.098e-03 -1.403 0.1680
## Pop -4.209e-08 6.137e-08 -0.686 0.4967
## m.per.f.100 -8.911e-03 3.551e-02 -0.251 0.8031
## Median.Age 4.028e-02 7.441e-02 0.541 0.5912
## adj.median.income -1.404e-05 1.462e-05 -0.960 0.3425
## race.white 4.370e-02 1.348e-01 0.324 0.7475
## race.black 3.483e-02 1.328e-01 0.262 0.7945
## race.asian 2.738e-02 1.403e-01 0.195 0.8462
## race.latin -1.023e-02 8.699e-03 -1.176 0.2462
## race.indigenous -3.819e-01 1.929e-01 -1.980 0.0545 .
## race.other 8.903e-02 1.186e-01 0.751 0.4571
## race.mixed 1.332e-01 1.384e-01 0.963 0.3413
## i.agri 6.868e-01 4.665e-01 1.472 0.1486
## i.mining 9.595e-02 1.146e-01 0.837 0.4074
## i.utilities 6.647e-01 3.534e-01 1.881 0.0671 .
## i.construction -8.453e-02 6.511e-02 -1.298 0.2015
## i.manufacturing 2.683e-02 4.173e-02 0.643 0.5238
## i.wholesale -2.604e-01 1.469e-01 -1.772 0.0838 .
## i.retail -3.072e-03 7.626e-02 -0.040 0.9681
## i.transportation 9.827e-02 6.857e-02 1.433 0.1594
## i.info.culture 4.280e-02 8.757e-02 0.489 0.6276
## i.finance.insur 2.713e-02 4.645e-02 0.584 0.5624
## i.realestate 3.086e-01 2.743e-01 1.125 0.2672
## i.sci.technical 1.026e-02 4.637e-02 0.221 0.8260
## i.company.management 1.656e+00 1.282e+00 1.292 0.2036
## i.admin.waste 5.166e-02 1.047e-01 0.493 0.6244
## i.education 2.439e-02 5.278e-02 0.462 0.6464
## i.health -3.396e-02 3.275e-02 -1.037 0.3058
## i.arts.entertain -1.147e-01 1.107e-01 -1.036 0.3062
## i.food.accommo -2.456e-03 6.217e-02 -0.040 0.9687
## i.other -2.061e-02 1.045e-01 -0.197 0.8446
## i.pub.admin NA NA NA NA
## z.pct.att.pop 4.123e-02 8.268e-02 0.499 0.6207
## pct.tick.income 1.192e+00 3.158e+00 0.377 0.7079
## payroll.pct 4.102e-03 1.650e-03 2.486 0.0171 *
## tickprice.diff.1 4.702e-03 8.353e-03 0.563 0.5766
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2094 on 41 degrees of freedom
## Multiple R-squared: 0.5956, Adjusted R-squared: 0.2505
## F-statistic: 1.726 on 35 and 41 DF, p-value: 0.0469
step.back<- stepAIC(full, direction = "backward", trace = F)
step.back$anova
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 + Median.Age +
## adj.median.income + race.white + race.black + race.asian +
## race.latin + race.indigenous + race.other + race.mixed +
## i.agri + i.mining + i.utilities + i.construction + i.manufacturing +
## i.wholesale + i.retail + i.transportation + i.info.culture +
## i.finance.insur + i.realestate + i.sci.technical + i.company.management +
## i.admin.waste + i.education + i.health + i.arts.entertain +
## i.food.accommo + i.other + i.pub.admin + z.pct.att.pop +
## pct.tick.income + payroll.pct + tickprice.diff.1
##
## Final Model:
## cor.1by1.big ~ adj.ticket.price + adj.median.income + race.white +
## race.latin + race.indigenous + race.other + race.mixed +
## i.agri + i.utilities + i.construction + i.wholesale + i.transportation +
## i.realestate + i.company.management + i.health + i.arts.entertain +
## z.pct.att.pop + payroll.pct
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 41 1.797100 -217.3376
## 2 - i.pub.admin 0 0.000000e+00 41 1.797100 -217.3376
## 3 - i.food.accommo 1 6.840518e-05 42 1.797169 -219.3347
## 4 - i.retail 1 6.662072e-05 43 1.797235 -221.3318
## 5 - race.asian 1 1.558693e-03 44 1.798794 -223.2651
## 6 - i.sci.technical 1 2.184344e-03 45 1.800978 -225.1716
## 7 - i.other 1 5.315784e-03 46 1.806294 -226.9447
## 8 - m.per.f.100 1 6.320710e-03 47 1.812615 -228.6757
## 9 - pct.tick.income 1 5.580834e-03 48 1.818196 -230.4390
## 10 - i.info.culture 1 5.490905e-03 49 1.823686 -232.2068
## 11 - i.finance.insur 1 1.152619e-02 50 1.835213 -233.7217
## 12 - i.manufacturing 1 3.501397e-03 51 1.838714 -235.5749
## 13 - tickprice.diff.1 1 1.614673e-02 52 1.854861 -236.9017
## 14 - i.education 1 2.433345e-02 53 1.879194 -237.8981
## 15 - race.black 1 1.601560e-02 54 1.895210 -239.2446
## 16 - i.mining 1 3.366936e-02 55 1.928879 -239.8887
## 17 - Pop 1 2.477971e-02 56 1.953659 -240.9058
## 18 - Median.Age 1 3.330656e-02 57 1.986965 -241.6042
## 19 - i.admin.waste 1 3.841679e-02 58 2.025382 -242.1296
summary(step.back)
##
## Call:
## lm(formula = cor.1by1.big ~ adj.ticket.price + adj.median.income +
## race.white + race.latin + race.indigenous + race.other +
## race.mixed + i.agri + i.utilities + i.construction + i.wholesale +
## i.transportation + i.realestate + i.company.management +
## i.health + i.arts.entertain + z.pct.att.pop + payroll.pct,
## data = trainset.stripped)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.34496 -0.10418 -0.01453 0.11595 0.39372
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.065e+00 5.678e-01 1.876 0.065750 .
## adj.ticket.price -6.738e-03 1.682e-03 -4.005 0.000179 ***
## adj.median.income -1.582e-05 4.700e-06 -3.366 0.001357 **
## race.white 7.526e-03 3.189e-03 2.360 0.021683 *
## race.latin -9.565e-03 3.183e-03 -3.005 0.003922 **
## race.indigenous -3.645e-01 8.438e-02 -4.320 6.18e-05 ***
## race.other 4.531e-02 8.192e-03 5.531 7.98e-07 ***
## race.mixed 6.981e-02 3.892e-02 1.794 0.078054 .
## i.agri 3.958e-01 2.790e-01 1.418 0.161465
## i.utilities 7.299e-01 1.688e-01 4.323 6.12e-05 ***
## i.construction -8.002e-02 2.352e-02 -3.402 0.001216 **
## i.wholesale -1.740e-01 8.034e-02 -2.166 0.034443 *
## i.transportation 5.256e-02 2.678e-02 1.963 0.054458 .
## i.realestate 1.777e-01 7.640e-02 2.326 0.023564 *
## i.company.management 1.480e+00 5.869e-01 2.521 0.014466 *
## i.health -4.164e-02 1.535e-02 -2.713 0.008773 **
## i.arts.entertain -1.073e-01 4.511e-02 -2.379 0.020670 *
## z.pct.att.pop 5.918e-02 3.648e-02 1.623 0.110115
## payroll.pct 4.114e-03 1.228e-03 3.349 0.001429 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1869 on 58 degrees of freedom
## Multiple R-squared: 0.5443, Adjusted R-squared: 0.4029
## F-statistic: 3.848 on 18 and 58 DF, p-value: 4.789e-05
While it looks like the backwards and stepwise regression models show some significance, I still have my concerns. The forward selection kept all the variables so I will not try using that model. Let’s test our backward and stepwise regression models on my test set.
performance <- as.data.frame(test_cor)
performance$step.back <- predict(step.back, testset)
performance$step <- predict(step, testset)
performance <- performance[order(performance$test_cor),]
plot(performance$test_cor)
lines(performance$step.back,type='l',col='blue')
lines(performance$step,type='l',col='red')
Yikes! That does not look like a well predicting model. Looks like it has overfit to the training set. I will try a new training set to test:
set.seed(66)
train_index <- createDataPartition(coreset$cor.1by1.big, times = 1, p = 0.7, list = F)
trainset <- coreset[train_index,]
testset <- coreset[-train_index,]
train_cor <- trainset$cor.1by1.big
test_cor <- testset$cor.1by1.big
testset<-testset[,-1]
cor.test <- cor(trainset, use = "na.or.complete")
cor.test.melt <- arrange(melt(cor.test), -abs(value))
trainset.stripped <- trainset[,-c(6,7,36:38,42)]
every.lm.df <- data.frame(colnames(trainset.stripped[,-1]))
every.lm.df$pvalue <- NA
every.lm.df$rsquared <- NA
names(every.lm.df)[1] <- "variable"
###tried to build a function but did not work
fit1 <- lm(cor.1by1.big~adj.ticket.price, data = trainset.stripped)
fit2 <- lm(cor.1by1.big~Pop, data = trainset.stripped)
fit3 <- lm(cor.1by1.big~m.per.f.100, data = trainset.stripped)
fit4 <- lm(cor.1by1.big~Median.Age, data = trainset.stripped)
fit5 <- lm(cor.1by1.big~adj.median.income, data = trainset.stripped)
fit6 <- lm(cor.1by1.big~race.white, data = trainset.stripped)
fit7 <- lm(cor.1by1.big~race.black, data = trainset.stripped)
fit8 <- lm(cor.1by1.big~race.asian, data = trainset.stripped)
fit9 <- lm(cor.1by1.big~race.latin, data = trainset.stripped)
fit10 <- lm(cor.1by1.big~race.indigenous, data = trainset.stripped)
fit11 <- lm(cor.1by1.big~race.other, data = trainset.stripped)
fit12 <- lm(cor.1by1.big~race.mixed, data = trainset.stripped)
fit13 <- lm(cor.1by1.big~i.agri, data = trainset.stripped)
fit14 <- lm(cor.1by1.big~i.mining, data = trainset.stripped)
fit15 <- lm(cor.1by1.big~i.utilities, data = trainset.stripped)
fit16 <- lm(cor.1by1.big~i.construction, data = trainset.stripped)
fit17 <- lm(cor.1by1.big~i.manufacturing, data = trainset.stripped)
fit18 <- lm(cor.1by1.big~i.wholesale, data = trainset.stripped)
fit19 <- lm(cor.1by1.big~i.retail, data = trainset.stripped)
fit20 <- lm(cor.1by1.big~i.transportation, data = trainset.stripped)
fit21 <- lm(cor.1by1.big~i.info.culture, data = trainset.stripped)
fit22 <- lm(cor.1by1.big~i.finance.insur, data = trainset.stripped)
fit23 <- lm(cor.1by1.big~i.realestate, data = trainset.stripped)
fit24 <- lm(cor.1by1.big~i.sci.technical, data = trainset.stripped)
fit25 <- lm(cor.1by1.big~i.company.management, data = trainset.stripped)
fit26 <- lm(cor.1by1.big~i.admin.waste, data = trainset.stripped)
fit27 <- lm(cor.1by1.big~i.education, data = trainset.stripped)
fit28 <- lm(cor.1by1.big~i.health, data = trainset.stripped)
fit29 <- lm(cor.1by1.big~i.arts.entertain, data = trainset.stripped)
fit30 <- lm(cor.1by1.big~i.food.accommo, data = trainset.stripped)
fit31 <- lm(cor.1by1.big~i.other, data = trainset.stripped)
fit32 <- lm(cor.1by1.big~i.pub.admin, data = trainset.stripped)
fit33 <- lm(cor.1by1.big~z.pct.att.pop, data = trainset.stripped)
fit34 <- lm(cor.1by1.big~pct.tick.income, data = trainset.stripped)
fit35 <- lm(cor.1by1.big~payroll.pct, data = trainset.stripped)
fit36 <- lm(cor.1by1.big~tickprice.diff.1, data = trainset.stripped)
every.lm.df[1,2] <- anova(fit1)$'Pr(>F)'[1]
every.lm.df[2,2] <- anova(fit2)$'Pr(>F)'[1]
every.lm.df[3,2] <- anova(fit3)$'Pr(>F)'[1]
every.lm.df[4,2] <- anova(fit4)$'Pr(>F)'[1]
every.lm.df[5,2] <- anova(fit5)$'Pr(>F)'[1]
every.lm.df[6,2] <- anova(fit6)$'Pr(>F)'[1]
every.lm.df[7,2] <- anova(fit7)$'Pr(>F)'[1]
every.lm.df[8,2] <- anova(fit8)$'Pr(>F)'[1]
every.lm.df[9,2] <- anova(fit9)$'Pr(>F)'[1]
every.lm.df[10,2] <- anova(fit10)$'Pr(>F)'[1]
every.lm.df[11,2] <- anova(fit11)$'Pr(>F)'[1]
every.lm.df[12,2] <- anova(fit12)$'Pr(>F)'[1]
every.lm.df[13,2] <- anova(fit13)$'Pr(>F)'[1]
every.lm.df[14,2] <- anova(fit14)$'Pr(>F)'[1]
every.lm.df[15,2] <- anova(fit15)$'Pr(>F)'[1]
every.lm.df[16,2] <- anova(fit16)$'Pr(>F)'[1]
every.lm.df[17,2] <- anova(fit17)$'Pr(>F)'[1]
every.lm.df[18,2] <- anova(fit18)$'Pr(>F)'[1]
every.lm.df[19,2] <- anova(fit19)$'Pr(>F)'[1]
every.lm.df[20,2] <- anova(fit20)$'Pr(>F)'[1]
every.lm.df[21,2] <- anova(fit21)$'Pr(>F)'[1]
every.lm.df[22,2] <- anova(fit22)$'Pr(>F)'[1]
every.lm.df[23,2] <- anova(fit23)$'Pr(>F)'[1]
every.lm.df[24,2] <- anova(fit24)$'Pr(>F)'[1]
every.lm.df[25,2] <- anova(fit25)$'Pr(>F)'[1]
every.lm.df[26,2] <- anova(fit26)$'Pr(>F)'[1]
every.lm.df[27,2] <- anova(fit27)$'Pr(>F)'[1]
every.lm.df[28,2] <- anova(fit28)$'Pr(>F)'[1]
every.lm.df[29,2] <- anova(fit29)$'Pr(>F)'[1]
every.lm.df[30,2] <- anova(fit30)$'Pr(>F)'[1]
every.lm.df[31,2] <- anova(fit31)$'Pr(>F)'[1]
every.lm.df[32,2] <- anova(fit32)$'Pr(>F)'[1]
every.lm.df[33,2] <- anova(fit33)$'Pr(>F)'[1]
every.lm.df[34,2] <- anova(fit34)$'Pr(>F)'[1]
every.lm.df[35,2] <- anova(fit35)$'Pr(>F)'[1]
every.lm.df[36,2] <- anova(fit36)$'Pr(>F)'[1]
every.lm.df[1,3] <- summary(fit1)$r.squared
every.lm.df[2,3] <- summary(fit2)$r.squared
every.lm.df[3,3] <- summary(fit3)$r.squared
every.lm.df[4,3] <- summary(fit4)$r.squared
every.lm.df[5,3] <- summary(fit5)$r.squared
every.lm.df[6,3] <- summary(fit6)$r.squared
every.lm.df[7,3] <- summary(fit7)$r.squared
every.lm.df[8,3] <- summary(fit8)$r.squared
every.lm.df[9,3] <- summary(fit9)$r.squared
every.lm.df[10,3] <- summary(fit10)$r.squared
every.lm.df[11,3] <- summary(fit11)$r.squared
every.lm.df[12,3] <- summary(fit12)$r.squared
every.lm.df[13,3] <- summary(fit13)$r.squared
every.lm.df[14,3] <- summary(fit14)$r.squared
every.lm.df[15,3] <- summary(fit15)$r.squared
every.lm.df[16,3] <- summary(fit16)$r.squared
every.lm.df[17,3] <- summary(fit17)$r.squared
every.lm.df[18,3] <- summary(fit18)$r.squared
every.lm.df[19,3] <- summary(fit19)$r.squared
every.lm.df[20,3] <- summary(fit20)$r.squared
every.lm.df[21,3] <- summary(fit21)$r.squared
every.lm.df[22,3] <- summary(fit22)$r.squared
every.lm.df[23,3] <- summary(fit23)$r.squared
every.lm.df[24,3] <- summary(fit24)$r.squared
every.lm.df[25,3] <- summary(fit25)$r.squared
every.lm.df[26,3] <- summary(fit26)$r.squared
every.lm.df[27,3] <- summary(fit27)$r.squared
every.lm.df[28,3] <- summary(fit28)$r.squared
every.lm.df[29,3] <- summary(fit29)$r.squared
every.lm.df[30,3] <- summary(fit30)$r.squared
every.lm.df[31,3] <- summary(fit31)$r.squared
every.lm.df[32,3] <-summary(fit32)$r.squared
every.lm.df[33,3] <- summary(fit33)$r.squared
every.lm.df[34,3] <- summary(fit34)$r.squared
every.lm.df[35,3] <- summary(fit35)$r.squared
every.lm.df[36,3] <- summary(fit36)$r.squared
full <- lm(cor.1by1.big~., data = trainset.stripped)
step <- stepAIC(full, direction = "both", trace = F)
step$anova
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 + Median.Age +
## adj.median.income + race.white + race.black + race.asian +
## race.latin + race.indigenous + race.other + race.mixed +
## i.agri + i.mining + i.utilities + i.construction + i.manufacturing +
## i.wholesale + i.retail + i.transportation + i.info.culture +
## i.finance.insur + i.realestate + i.sci.technical + i.company.management +
## i.admin.waste + i.education + i.health + i.arts.entertain +
## i.food.accommo + i.other + i.pub.admin + z.pct.att.pop +
## pct.tick.income + payroll.pct + tickprice.diff.1
##
## Final Model:
## cor.1by1.big ~ Pop + race.white + race.black + race.asian + race.indigenous +
## race.other + i.transportation + i.admin.waste + i.education +
## i.other + pct.tick.income + payroll.pct
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 41 2.142817 -203.7897
## 2 - i.pub.admin 0 0.000000e+00 41 2.142817 -203.7897
## 3 - race.mixed 1 8.698656e-05 42 2.142904 -205.7866
## 4 - adj.ticket.price 1 4.839874e-04 43 2.143388 -207.7692
## 5 - i.utilities 1 1.319939e-03 44 2.144708 -209.7218
## 6 - i.finance.insur 1 2.513822e-03 45 2.147221 -211.6316
## 7 - i.info.culture 1 4.937379e-03 46 2.152159 -213.4547
## 8 - i.realestate 1 1.324748e-02 47 2.165406 -214.9822
## 9 - i.health 1 2.011350e-02 48 2.185520 -216.2703
## 10 - i.food.accommo 1 1.308001e-02 49 2.198600 -217.8108
## 11 - race.latin 1 1.419612e-02 50 2.212796 -219.3152
## 12 - i.mining 1 2.950924e-02 51 2.242305 -220.2952
## 13 - tickprice.diff.1 1 2.782312e-02 52 2.270128 -221.3456
## 14 - z.pct.att.pop 1 4.543336e-02 53 2.315562 -221.8198
## 15 - Median.Age 1 4.422811e-02 54 2.359790 -222.3629
## 16 - i.company.management 1 1.699285e-02 55 2.376782 -223.8104
## 17 - adj.median.income 1 4.901062e-02 56 2.425793 -224.2388
## 18 - i.retail 1 3.501037e-02 57 2.460803 -225.1354
## 19 - i.manufacturing 1 4.638659e-02 58 2.507190 -225.6975
## 20 - i.arts.entertain 1 1.861312e-02 59 2.525803 -227.1280
## 21 - i.construction 1 1.590265e-02 60 2.541706 -228.6447
## 22 - i.sci.technical 1 1.631322e-02 61 2.558019 -230.1521
## 23 - m.per.f.100 1 3.405628e-02 62 2.592075 -231.1337
## 24 - i.agri 1 4.710081e-02 63 2.639176 -231.7471
## 25 - i.wholesale 1 3.196753e-02 64 2.671144 -232.8200
summary(step)
##
## Call:
## lm(formula = cor.1by1.big ~ Pop + race.white + race.black + race.asian +
## race.indigenous + race.other + i.transportation + i.admin.waste +
## i.education + i.other + pct.tick.income + payroll.pct, data = trainset.stripped)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.49401 -0.13496 0.01773 0.13799 0.38065
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.648e+01 4.460e+00 3.694 0.000459 ***
## Pop -7.575e-08 2.035e-08 -3.722 0.000419 ***
## race.white -1.575e-01 4.398e-02 -3.582 0.000659 ***
## race.black -1.424e-01 4.258e-02 -3.344 0.001384 **
## race.asian -1.757e-01 4.827e-02 -3.639 0.000548 ***
## race.indigenous -1.175e-01 7.672e-02 -1.532 0.130418
## race.other -1.266e-01 4.345e-02 -2.914 0.004909 **
## i.transportation 4.262e-02 2.465e-02 1.729 0.088644 .
## i.admin.waste -1.370e-01 4.919e-02 -2.784 0.007046 **
## i.education -3.814e-02 1.960e-02 -1.946 0.056077 .
## i.other -7.303e-02 2.783e-02 -2.624 0.010852 *
## pct.tick.income -1.878e+00 7.188e-01 -2.612 0.011189 *
## payroll.pct 1.806e-03 1.204e-03 1.500 0.138408
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2043 on 64 degrees of freedom
## Multiple R-squared: 0.4353, Adjusted R-squared: 0.3294
## F-statistic: 4.111 on 12 and 64 DF, p-value: 9.588e-05
step.for <- stepAIC(full, direction = "forward", trace = F)
step.for$anova
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 + Median.Age +
## adj.median.income + race.white + race.black + race.asian +
## race.latin + race.indigenous + race.other + race.mixed +
## i.agri + i.mining + i.utilities + i.construction + i.manufacturing +
## i.wholesale + i.retail + i.transportation + i.info.culture +
## i.finance.insur + i.realestate + i.sci.technical + i.company.management +
## i.admin.waste + i.education + i.health + i.arts.entertain +
## i.food.accommo + i.other + i.pub.admin + z.pct.att.pop +
## pct.tick.income + payroll.pct + tickprice.diff.1
##
## Final Model:
## cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 + Median.Age +
## adj.median.income + race.white + race.black + race.asian +
## race.latin + race.indigenous + race.other + race.mixed +
## i.agri + i.mining + i.utilities + i.construction + i.manufacturing +
## i.wholesale + i.retail + i.transportation + i.info.culture +
## i.finance.insur + i.realestate + i.sci.technical + i.company.management +
## i.admin.waste + i.education + i.health + i.arts.entertain +
## i.food.accommo + i.other + i.pub.admin + z.pct.att.pop +
## pct.tick.income + payroll.pct + tickprice.diff.1
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 41 2.142817 -203.7897
summary(step.for)
##
## Call:
## lm(formula = cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 +
## Median.Age + adj.median.income + race.white + race.black +
## race.asian + race.latin + race.indigenous + race.other +
## race.mixed + i.agri + i.mining + i.utilities + i.construction +
## i.manufacturing + i.wholesale + i.retail + i.transportation +
## i.info.culture + i.finance.insur + i.realestate + i.sci.technical +
## i.company.management + i.admin.waste + i.education + i.health +
## i.arts.entertain + i.food.accommo + i.other + i.pub.admin +
## z.pct.att.pop + pct.tick.income + payroll.pct + tickprice.diff.1,
## data = trainset.stripped)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.44126 -0.11706 0.01704 0.10725 0.38607
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.669e+01 1.304e+01 1.279 0.2079
## adj.ticket.price 9.994e-04 9.691e-03 0.103 0.9184
## Pop -1.530e-07 7.441e-08 -2.056 0.0462 *
## m.per.f.100 2.075e-02 4.408e-02 0.471 0.6403
## Median.Age 4.790e-02 7.280e-02 0.658 0.5142
## adj.median.income 1.703e-05 2.201e-05 0.774 0.4436
## race.white -1.815e-01 1.355e-01 -1.339 0.1878
## race.black -1.575e-01 1.331e-01 -1.183 0.2436
## race.asian -2.289e-01 1.501e-01 -1.525 0.1349
## race.latin 1.010e-02 1.237e-02 0.817 0.4189
## race.indigenous -3.275e-01 2.903e-01 -1.128 0.2658
## race.other -1.197e-01 1.249e-01 -0.958 0.3436
## race.mixed 6.303e-03 1.545e-01 0.041 0.9677
## i.agri 4.503e-01 7.793e-01 0.578 0.5665
## i.mining 1.522e-01 2.142e-01 0.710 0.4815
## i.utilities -7.970e-02 6.682e-01 -0.119 0.9056
## i.construction -1.200e-01 9.228e-02 -1.300 0.2008
## i.manufacturing -3.453e-02 6.010e-02 -0.575 0.5688
## i.wholesale -3.711e-01 2.074e-01 -1.790 0.0809 .
## i.retail 1.264e-01 8.144e-02 1.552 0.1282
## i.transportation 5.291e-02 6.984e-02 0.758 0.4530
## i.info.culture 2.780e-02 9.274e-02 0.300 0.7659
## i.finance.insur 1.365e-02 5.557e-02 0.246 0.8072
## i.realestate 1.443e-01 3.097e-01 0.466 0.6438
## i.sci.technical -8.981e-02 5.961e-02 -1.507 0.1396
## i.company.management 1.339e+00 1.462e+00 0.916 0.3651
## i.admin.waste -2.614e-01 1.389e-01 -1.882 0.0669 .
## i.education -6.396e-02 7.001e-02 -0.914 0.3663
## i.health -2.188e-02 4.724e-02 -0.463 0.6457
## i.arts.entertain -1.158e-01 1.489e-01 -0.777 0.4414
## i.food.accommo -4.025e-02 6.610e-02 -0.609 0.5459
## i.other -1.253e-01 1.207e-01 -1.038 0.3052
## i.pub.admin NA NA NA NA
## z.pct.att.pop 3.995e-02 6.741e-02 0.593 0.5567
## pct.tick.income -2.874e+00 4.918e+00 -0.584 0.5622
## payroll.pct 2.323e-03 1.808e-03 1.285 0.2060
## tickprice.diff.1 6.687e-03 7.755e-03 0.862 0.3936
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2286 on 41 degrees of freedom
## Multiple R-squared: 0.547, Adjusted R-squared: 0.1603
## F-statistic: 1.414 on 35 and 41 DF, p-value: 0.1425
step.back<- stepAIC(full, direction = "backward", trace = F)
step.back$anova
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 + Median.Age +
## adj.median.income + race.white + race.black + race.asian +
## race.latin + race.indigenous + race.other + race.mixed +
## i.agri + i.mining + i.utilities + i.construction + i.manufacturing +
## i.wholesale + i.retail + i.transportation + i.info.culture +
## i.finance.insur + i.realestate + i.sci.technical + i.company.management +
## i.admin.waste + i.education + i.health + i.arts.entertain +
## i.food.accommo + i.other + i.pub.admin + z.pct.att.pop +
## pct.tick.income + payroll.pct + tickprice.diff.1
##
## Final Model:
## cor.1by1.big ~ Pop + race.white + race.black + race.asian + race.indigenous +
## race.other + i.transportation + i.admin.waste + i.education +
## i.other + pct.tick.income + payroll.pct
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 41 2.142817 -203.7897
## 2 - i.pub.admin 0 0.000000e+00 41 2.142817 -203.7897
## 3 - race.mixed 1 8.698656e-05 42 2.142904 -205.7866
## 4 - adj.ticket.price 1 4.839874e-04 43 2.143388 -207.7692
## 5 - i.utilities 1 1.319939e-03 44 2.144708 -209.7218
## 6 - i.finance.insur 1 2.513822e-03 45 2.147221 -211.6316
## 7 - i.info.culture 1 4.937379e-03 46 2.152159 -213.4547
## 8 - i.realestate 1 1.324748e-02 47 2.165406 -214.9822
## 9 - i.health 1 2.011350e-02 48 2.185520 -216.2703
## 10 - i.food.accommo 1 1.308001e-02 49 2.198600 -217.8108
## 11 - race.latin 1 1.419612e-02 50 2.212796 -219.3152
## 12 - i.mining 1 2.950924e-02 51 2.242305 -220.2952
## 13 - tickprice.diff.1 1 2.782312e-02 52 2.270128 -221.3456
## 14 - z.pct.att.pop 1 4.543336e-02 53 2.315562 -221.8198
## 15 - Median.Age 1 4.422811e-02 54 2.359790 -222.3629
## 16 - i.company.management 1 1.699285e-02 55 2.376782 -223.8104
## 17 - adj.median.income 1 4.901062e-02 56 2.425793 -224.2388
## 18 - i.retail 1 3.501037e-02 57 2.460803 -225.1354
## 19 - i.manufacturing 1 4.638659e-02 58 2.507190 -225.6975
## 20 - i.arts.entertain 1 1.861312e-02 59 2.525803 -227.1280
## 21 - i.construction 1 1.590265e-02 60 2.541706 -228.6447
## 22 - i.sci.technical 1 1.631322e-02 61 2.558019 -230.1521
## 23 - m.per.f.100 1 3.405628e-02 62 2.592075 -231.1337
## 24 - i.agri 1 4.710081e-02 63 2.639176 -231.7471
## 25 - i.wholesale 1 3.196753e-02 64 2.671144 -232.8200
summary(step.back)
##
## Call:
## lm(formula = cor.1by1.big ~ Pop + race.white + race.black + race.asian +
## race.indigenous + race.other + i.transportation + i.admin.waste +
## i.education + i.other + pct.tick.income + payroll.pct, data = trainset.stripped)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.49401 -0.13496 0.01773 0.13799 0.38065
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.648e+01 4.460e+00 3.694 0.000459 ***
## Pop -7.575e-08 2.035e-08 -3.722 0.000419 ***
## race.white -1.575e-01 4.398e-02 -3.582 0.000659 ***
## race.black -1.424e-01 4.258e-02 -3.344 0.001384 **
## race.asian -1.757e-01 4.827e-02 -3.639 0.000548 ***
## race.indigenous -1.175e-01 7.672e-02 -1.532 0.130418
## race.other -1.266e-01 4.345e-02 -2.914 0.004909 **
## i.transportation 4.262e-02 2.465e-02 1.729 0.088644 .
## i.admin.waste -1.370e-01 4.919e-02 -2.784 0.007046 **
## i.education -3.814e-02 1.960e-02 -1.946 0.056077 .
## i.other -7.303e-02 2.783e-02 -2.624 0.010852 *
## pct.tick.income -1.878e+00 7.188e-01 -2.612 0.011189 *
## payroll.pct 1.806e-03 1.204e-03 1.500 0.138408
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2043 on 64 degrees of freedom
## Multiple R-squared: 0.4353, Adjusted R-squared: 0.3294
## F-statistic: 4.111 on 12 and 64 DF, p-value: 9.588e-05
performance <- as.data.frame(test_cor)
performance$step.back <- predict(step.back, testset)
performance$step <- predict(step, testset)
performance <- performance[order(performance$test_cor),]
plot(performance$test_cor)
lines(performance$step.back,type='l',col='blue')
lines(performance$step,type='l',col='red')
Looks like another failed model. Let’s try one more time:
set.seed(4)
train_index <- createDataPartition(coreset$cor.1by1.big, times = 1, p = 0.7, list = F)
trainset <- coreset[train_index,]
testset <- coreset[-train_index,]
train_cor <- trainset$cor.1by1.big
test_cor <- testset$cor.1by1.big
testset<-testset[,-1]
cor.test <- cor(trainset, use = "na.or.complete")
cor.test.melt <- arrange(melt(cor.test), -abs(value))
trainset.stripped <- trainset[,-c(6,7,36:38,42)]
every.lm.df <- data.frame(colnames(trainset.stripped[,-1]))
every.lm.df$pvalue <- NA
every.lm.df$rsquared <- NA
names(every.lm.df)[1] <- "variable"
###tried to build a function but did not work
fit1 <- lm(cor.1by1.big~adj.ticket.price, data = trainset.stripped)
fit2 <- lm(cor.1by1.big~Pop, data = trainset.stripped)
fit3 <- lm(cor.1by1.big~m.per.f.100, data = trainset.stripped)
fit4 <- lm(cor.1by1.big~Median.Age, data = trainset.stripped)
fit5 <- lm(cor.1by1.big~adj.median.income, data = trainset.stripped)
fit6 <- lm(cor.1by1.big~race.white, data = trainset.stripped)
fit7 <- lm(cor.1by1.big~race.black, data = trainset.stripped)
fit8 <- lm(cor.1by1.big~race.asian, data = trainset.stripped)
fit9 <- lm(cor.1by1.big~race.latin, data = trainset.stripped)
fit10 <- lm(cor.1by1.big~race.indigenous, data = trainset.stripped)
fit11 <- lm(cor.1by1.big~race.other, data = trainset.stripped)
fit12 <- lm(cor.1by1.big~race.mixed, data = trainset.stripped)
fit13 <- lm(cor.1by1.big~i.agri, data = trainset.stripped)
fit14 <- lm(cor.1by1.big~i.mining, data = trainset.stripped)
fit15 <- lm(cor.1by1.big~i.utilities, data = trainset.stripped)
fit16 <- lm(cor.1by1.big~i.construction, data = trainset.stripped)
fit17 <- lm(cor.1by1.big~i.manufacturing, data = trainset.stripped)
fit18 <- lm(cor.1by1.big~i.wholesale, data = trainset.stripped)
fit19 <- lm(cor.1by1.big~i.retail, data = trainset.stripped)
fit20 <- lm(cor.1by1.big~i.transportation, data = trainset.stripped)
fit21 <- lm(cor.1by1.big~i.info.culture, data = trainset.stripped)
fit22 <- lm(cor.1by1.big~i.finance.insur, data = trainset.stripped)
fit23 <- lm(cor.1by1.big~i.realestate, data = trainset.stripped)
fit24 <- lm(cor.1by1.big~i.sci.technical, data = trainset.stripped)
fit25 <- lm(cor.1by1.big~i.company.management, data = trainset.stripped)
fit26 <- lm(cor.1by1.big~i.admin.waste, data = trainset.stripped)
fit27 <- lm(cor.1by1.big~i.education, data = trainset.stripped)
fit28 <- lm(cor.1by1.big~i.health, data = trainset.stripped)
fit29 <- lm(cor.1by1.big~i.arts.entertain, data = trainset.stripped)
fit30 <- lm(cor.1by1.big~i.food.accommo, data = trainset.stripped)
fit31 <- lm(cor.1by1.big~i.other, data = trainset.stripped)
fit32 <- lm(cor.1by1.big~i.pub.admin, data = trainset.stripped)
fit33 <- lm(cor.1by1.big~z.pct.att.pop, data = trainset.stripped)
fit34 <- lm(cor.1by1.big~pct.tick.income, data = trainset.stripped)
fit35 <- lm(cor.1by1.big~payroll.pct, data = trainset.stripped)
fit36 <- lm(cor.1by1.big~tickprice.diff.1, data = trainset.stripped)
every.lm.df[1,2] <- anova(fit1)$'Pr(>F)'[1]
every.lm.df[2,2] <- anova(fit2)$'Pr(>F)'[1]
every.lm.df[3,2] <- anova(fit3)$'Pr(>F)'[1]
every.lm.df[4,2] <- anova(fit4)$'Pr(>F)'[1]
every.lm.df[5,2] <- anova(fit5)$'Pr(>F)'[1]
every.lm.df[6,2] <- anova(fit6)$'Pr(>F)'[1]
every.lm.df[7,2] <- anova(fit7)$'Pr(>F)'[1]
every.lm.df[8,2] <- anova(fit8)$'Pr(>F)'[1]
every.lm.df[9,2] <- anova(fit9)$'Pr(>F)'[1]
every.lm.df[10,2] <- anova(fit10)$'Pr(>F)'[1]
every.lm.df[11,2] <- anova(fit11)$'Pr(>F)'[1]
every.lm.df[12,2] <- anova(fit12)$'Pr(>F)'[1]
every.lm.df[13,2] <- anova(fit13)$'Pr(>F)'[1]
every.lm.df[14,2] <- anova(fit14)$'Pr(>F)'[1]
every.lm.df[15,2] <- anova(fit15)$'Pr(>F)'[1]
every.lm.df[16,2] <- anova(fit16)$'Pr(>F)'[1]
every.lm.df[17,2] <- anova(fit17)$'Pr(>F)'[1]
every.lm.df[18,2] <- anova(fit18)$'Pr(>F)'[1]
every.lm.df[19,2] <- anova(fit19)$'Pr(>F)'[1]
every.lm.df[20,2] <- anova(fit20)$'Pr(>F)'[1]
every.lm.df[21,2] <- anova(fit21)$'Pr(>F)'[1]
every.lm.df[22,2] <- anova(fit22)$'Pr(>F)'[1]
every.lm.df[23,2] <- anova(fit23)$'Pr(>F)'[1]
every.lm.df[24,2] <- anova(fit24)$'Pr(>F)'[1]
every.lm.df[25,2] <- anova(fit25)$'Pr(>F)'[1]
every.lm.df[26,2] <- anova(fit26)$'Pr(>F)'[1]
every.lm.df[27,2] <- anova(fit27)$'Pr(>F)'[1]
every.lm.df[28,2] <- anova(fit28)$'Pr(>F)'[1]
every.lm.df[29,2] <- anova(fit29)$'Pr(>F)'[1]
every.lm.df[30,2] <- anova(fit30)$'Pr(>F)'[1]
every.lm.df[31,2] <- anova(fit31)$'Pr(>F)'[1]
every.lm.df[32,2] <- anova(fit32)$'Pr(>F)'[1]
every.lm.df[33,2] <- anova(fit33)$'Pr(>F)'[1]
every.lm.df[34,2] <- anova(fit34)$'Pr(>F)'[1]
every.lm.df[35,2] <- anova(fit35)$'Pr(>F)'[1]
every.lm.df[36,2] <- anova(fit36)$'Pr(>F)'[1]
every.lm.df[1,3] <- summary(fit1)$r.squared
every.lm.df[2,3] <- summary(fit2)$r.squared
every.lm.df[3,3] <- summary(fit3)$r.squared
every.lm.df[4,3] <- summary(fit4)$r.squared
every.lm.df[5,3] <- summary(fit5)$r.squared
every.lm.df[6,3] <- summary(fit6)$r.squared
every.lm.df[7,3] <- summary(fit7)$r.squared
every.lm.df[8,3] <- summary(fit8)$r.squared
every.lm.df[9,3] <- summary(fit9)$r.squared
every.lm.df[10,3] <- summary(fit10)$r.squared
every.lm.df[11,3] <- summary(fit11)$r.squared
every.lm.df[12,3] <- summary(fit12)$r.squared
every.lm.df[13,3] <- summary(fit13)$r.squared
every.lm.df[14,3] <- summary(fit14)$r.squared
every.lm.df[15,3] <- summary(fit15)$r.squared
every.lm.df[16,3] <- summary(fit16)$r.squared
every.lm.df[17,3] <- summary(fit17)$r.squared
every.lm.df[18,3] <- summary(fit18)$r.squared
every.lm.df[19,3] <- summary(fit19)$r.squared
every.lm.df[20,3] <- summary(fit20)$r.squared
every.lm.df[21,3] <- summary(fit21)$r.squared
every.lm.df[22,3] <- summary(fit22)$r.squared
every.lm.df[23,3] <- summary(fit23)$r.squared
every.lm.df[24,3] <- summary(fit24)$r.squared
every.lm.df[25,3] <- summary(fit25)$r.squared
every.lm.df[26,3] <- summary(fit26)$r.squared
every.lm.df[27,3] <- summary(fit27)$r.squared
every.lm.df[28,3] <- summary(fit28)$r.squared
every.lm.df[29,3] <- summary(fit29)$r.squared
every.lm.df[30,3] <- summary(fit30)$r.squared
every.lm.df[31,3] <- summary(fit31)$r.squared
every.lm.df[32,3] <-summary(fit32)$r.squared
every.lm.df[33,3] <- summary(fit33)$r.squared
every.lm.df[34,3] <- summary(fit34)$r.squared
every.lm.df[35,3] <- summary(fit35)$r.squared
every.lm.df[36,3] <- summary(fit36)$r.squared
full <- lm(cor.1by1.big~., data = trainset.stripped)
step <- stepAIC(full, direction = "both", trace = F)
step$anova
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 + Median.Age +
## adj.median.income + race.white + race.black + race.asian +
## race.latin + race.indigenous + race.other + race.mixed +
## i.agri + i.mining + i.utilities + i.construction + i.manufacturing +
## i.wholesale + i.retail + i.transportation + i.info.culture +
## i.finance.insur + i.realestate + i.sci.technical + i.company.management +
## i.admin.waste + i.education + i.health + i.arts.entertain +
## i.food.accommo + i.other + i.pub.admin + z.pct.att.pop +
## pct.tick.income + payroll.pct + tickprice.diff.1
##
## Final Model:
## cor.1by1.big ~ adj.ticket.price + m.per.f.100 + Median.Age +
## race.white + race.black + race.asian + race.latin + race.indigenous +
## race.other + i.agri + i.mining + i.utilities + i.finance.insur +
## i.company.management + i.education + i.health + i.other +
## payroll.pct
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 41 2.520170 -191.2999
## 2 - i.pub.admin 0 0.000000e+00 41 2.520170 -191.2999
## 3 - i.sci.technical 1 4.341235e-07 42 2.520170 -193.2999
## 4 - i.retail 1 3.632728e-05 43 2.520207 -195.2988
## 5 - i.admin.waste 1 1.902793e-04 44 2.520397 -197.2930
## 6 - pct.tick.income 1 1.494157e-03 45 2.521891 -199.2473
## 7 - z.pct.att.pop 1 3.564715e-03 46 2.525456 -201.1386
## 8 - i.transportation 1 3.156890e-03 47 2.528613 -203.0424
## 9 - race.mixed 1 6.023364e-03 48 2.534636 -204.8592
## 10 - adj.median.income 1 5.333797e-03 49 2.539970 -206.6973
## 11 - i.arts.entertain 1 5.793448e-03 50 2.545763 -208.5219
## 12 - i.wholesale 1 7.659889e-03 51 2.553423 -210.2905
## 13 - i.info.culture 1 8.909748e-03 52 2.562333 -212.0223
## 14 - i.food.accommo 1 9.999666e-03 53 2.572333 -213.7224
## 15 - i.construction 1 6.807192e-03 54 2.579140 -215.5189
## 16 - tickprice.diff.1 1 2.555870e-02 55 2.604698 -216.7596
## 17 - Pop 1 4.380728e-02 56 2.648506 -217.4754
## 18 - i.realestate 1 2.822534e-02 57 2.676731 -218.6591
## 19 - i.manufacturing 1 5.342762e-02 58 2.730159 -219.1373
summary(step)
##
## Call:
## lm(formula = cor.1by1.big ~ adj.ticket.price + m.per.f.100 +
## Median.Age + race.white + race.black + race.asian + race.latin +
## race.indigenous + race.other + i.agri + i.mining + i.utilities +
## i.finance.insur + i.company.management + i.education + i.health +
## i.other + payroll.pct, data = trainset.stripped)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.52901 -0.08166 0.00237 0.10699 0.49528
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.585521 5.971909 2.945 0.00465 **
## adj.ticket.price -0.002946 0.001696 -1.737 0.08769 .
## m.per.f.100 -0.020357 0.015836 -1.286 0.20373
## Median.Age 0.062546 0.025919 2.413 0.01900 *
## race.white -0.171731 0.053615 -3.203 0.00221 **
## race.black -0.173062 0.053271 -3.249 0.00193 **
## race.asian -0.206767 0.059874 -3.453 0.00104 **
## race.latin -0.007565 0.003741 -2.022 0.04780 *
## race.indigenous -0.324256 0.117851 -2.751 0.00790 **
## race.other -0.126541 0.051563 -2.454 0.01715 *
## i.agri -0.898461 0.430207 -2.088 0.04116 *
## i.mining -0.211939 0.098620 -2.149 0.03582 *
## i.utilities 1.045356 0.310692 3.365 0.00136 **
## i.finance.insur -0.089918 0.034397 -2.614 0.01138 *
## i.company.management 2.638886 1.003342 2.630 0.01091 *
## i.education 0.055289 0.018359 3.012 0.00385 **
## i.health -0.059161 0.021129 -2.800 0.00693 **
## i.other -0.077979 0.048001 -1.625 0.10968
## payroll.pct 0.002614 0.001329 1.967 0.05398 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.217 on 58 degrees of freedom
## Multiple R-squared: 0.4241, Adjusted R-squared: 0.2454
## F-statistic: 2.373 on 18 and 58 DF, p-value: 0.006829
step.for <- stepAIC(full, direction = "forward", trace = F)
step.for$anova
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 + Median.Age +
## adj.median.income + race.white + race.black + race.asian +
## race.latin + race.indigenous + race.other + race.mixed +
## i.agri + i.mining + i.utilities + i.construction + i.manufacturing +
## i.wholesale + i.retail + i.transportation + i.info.culture +
## i.finance.insur + i.realestate + i.sci.technical + i.company.management +
## i.admin.waste + i.education + i.health + i.arts.entertain +
## i.food.accommo + i.other + i.pub.admin + z.pct.att.pop +
## pct.tick.income + payroll.pct + tickprice.diff.1
##
## Final Model:
## cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 + Median.Age +
## adj.median.income + race.white + race.black + race.asian +
## race.latin + race.indigenous + race.other + race.mixed +
## i.agri + i.mining + i.utilities + i.construction + i.manufacturing +
## i.wholesale + i.retail + i.transportation + i.info.culture +
## i.finance.insur + i.realestate + i.sci.technical + i.company.management +
## i.admin.waste + i.education + i.health + i.arts.entertain +
## i.food.accommo + i.other + i.pub.admin + z.pct.att.pop +
## pct.tick.income + payroll.pct + tickprice.diff.1
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 41 2.52017 -191.2999
summary(step.for)
##
## Call:
## lm(formula = cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 +
## Median.Age + adj.median.income + race.white + race.black +
## race.asian + race.latin + race.indigenous + race.other +
## race.mixed + i.agri + i.mining + i.utilities + i.construction +
## i.manufacturing + i.wholesale + i.retail + i.transportation +
## i.info.culture + i.finance.insur + i.realestate + i.sci.technical +
## i.company.management + i.admin.waste + i.education + i.health +
## i.arts.entertain + i.food.accommo + i.other + i.pub.admin +
## z.pct.att.pop + pct.tick.income + payroll.pct + tickprice.diff.1,
## data = trainset.stripped)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.53887 -0.10530 -0.01021 0.11738 0.53065
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.093e+01 1.593e+01 0.686 0.4965
## adj.ticket.price -4.988e-03 9.921e-03 -0.503 0.6178
## Pop -3.465e-08 1.044e-07 -0.332 0.7417
## m.per.f.100 -1.411e-02 3.765e-02 -0.375 0.7098
## Median.Age 9.765e-02 8.292e-02 1.178 0.2457
## adj.median.income -3.933e-06 1.985e-05 -0.198 0.8439
## race.white -1.213e-01 1.552e-01 -0.782 0.4390
## race.black -1.215e-01 1.556e-01 -0.780 0.4396
## race.asian -1.543e-01 1.689e-01 -0.914 0.3661
## race.latin -7.574e-03 9.080e-03 -0.834 0.4091
## race.indigenous -3.572e-01 2.794e-01 -1.278 0.2084
## race.other -7.089e-02 1.556e-01 -0.456 0.6511
## race.mixed 4.532e-02 1.563e-01 0.290 0.7734
## i.agri -2.776e-01 1.264e+00 -0.220 0.8272
## i.mining -4.662e-02 3.037e-01 -0.154 0.8788
## i.utilities 6.471e-01 7.632e-01 0.848 0.4015
## i.construction -3.726e-02 1.236e-01 -0.301 0.7646
## i.manufacturing 2.712e-02 5.050e-02 0.537 0.5942
## i.wholesale -9.462e-02 2.246e-01 -0.421 0.6758
## i.retail -1.574e-03 7.032e-02 -0.022 0.9822
## i.transportation 1.352e-02 6.683e-02 0.202 0.8407
## i.info.culture -5.364e-02 9.452e-02 -0.568 0.5735
## i.finance.insur -2.098e-02 9.091e-02 -0.231 0.8186
## i.realestate 1.722e-01 4.365e-01 0.394 0.6953
## i.sci.technical -1.965e-04 7.396e-02 -0.003 0.9979
## i.company.management 1.768e+00 2.075e+00 0.852 0.3990
## i.admin.waste 6.900e-03 1.306e-01 0.053 0.9581
## i.education 5.215e-02 5.561e-02 0.938 0.3539
## i.health -5.432e-02 4.520e-02 -1.202 0.2363
## i.arts.entertain 2.506e-02 1.303e-01 0.192 0.8485
## i.food.accommo -4.774e-02 7.468e-02 -0.639 0.5262
## i.other -7.944e-02 1.126e-01 -0.706 0.4843
## i.pub.admin NA NA NA NA
## z.pct.att.pop 1.656e-02 8.265e-02 0.200 0.8422
## pct.tick.income 7.916e-01 5.063e+00 0.156 0.8765
## payroll.pct 3.426e-03 1.804e-03 1.899 0.0646 .
## tickprice.diff.1 4.652e-03 1.124e-02 0.414 0.6810
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2479 on 41 degrees of freedom
## Multiple R-squared: 0.4684, Adjusted R-squared: 0.01466
## F-statistic: 1.032 on 35 and 41 DF, p-value: 0.4579
step.back<- stepAIC(full, direction = "backward", trace = F)
step.back$anova
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## cor.1by1.big ~ adj.ticket.price + Pop + m.per.f.100 + Median.Age +
## adj.median.income + race.white + race.black + race.asian +
## race.latin + race.indigenous + race.other + race.mixed +
## i.agri + i.mining + i.utilities + i.construction + i.manufacturing +
## i.wholesale + i.retail + i.transportation + i.info.culture +
## i.finance.insur + i.realestate + i.sci.technical + i.company.management +
## i.admin.waste + i.education + i.health + i.arts.entertain +
## i.food.accommo + i.other + i.pub.admin + z.pct.att.pop +
## pct.tick.income + payroll.pct + tickprice.diff.1
##
## Final Model:
## cor.1by1.big ~ adj.ticket.price + m.per.f.100 + Median.Age +
## race.white + race.black + race.asian + race.latin + race.indigenous +
## race.other + i.agri + i.mining + i.utilities + i.finance.insur +
## i.company.management + i.education + i.health + i.other +
## payroll.pct
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 41 2.520170 -191.2999
## 2 - i.pub.admin 0 0.000000e+00 41 2.520170 -191.2999
## 3 - i.sci.technical 1 4.341235e-07 42 2.520170 -193.2999
## 4 - i.retail 1 3.632728e-05 43 2.520207 -195.2988
## 5 - i.admin.waste 1 1.902793e-04 44 2.520397 -197.2930
## 6 - pct.tick.income 1 1.494157e-03 45 2.521891 -199.2473
## 7 - z.pct.att.pop 1 3.564715e-03 46 2.525456 -201.1386
## 8 - i.transportation 1 3.156890e-03 47 2.528613 -203.0424
## 9 - race.mixed 1 6.023364e-03 48 2.534636 -204.8592
## 10 - adj.median.income 1 5.333797e-03 49 2.539970 -206.6973
## 11 - i.arts.entertain 1 5.793448e-03 50 2.545763 -208.5219
## 12 - i.wholesale 1 7.659889e-03 51 2.553423 -210.2905
## 13 - i.info.culture 1 8.909748e-03 52 2.562333 -212.0223
## 14 - i.food.accommo 1 9.999666e-03 53 2.572333 -213.7224
## 15 - i.construction 1 6.807192e-03 54 2.579140 -215.5189
## 16 - tickprice.diff.1 1 2.555870e-02 55 2.604698 -216.7596
## 17 - Pop 1 4.380728e-02 56 2.648506 -217.4754
## 18 - i.realestate 1 2.822534e-02 57 2.676731 -218.6591
## 19 - i.manufacturing 1 5.342762e-02 58 2.730159 -219.1373
summary(step.back)
##
## Call:
## lm(formula = cor.1by1.big ~ adj.ticket.price + m.per.f.100 +
## Median.Age + race.white + race.black + race.asian + race.latin +
## race.indigenous + race.other + i.agri + i.mining + i.utilities +
## i.finance.insur + i.company.management + i.education + i.health +
## i.other + payroll.pct, data = trainset.stripped)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.52901 -0.08166 0.00237 0.10699 0.49528
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.585521 5.971909 2.945 0.00465 **
## adj.ticket.price -0.002946 0.001696 -1.737 0.08769 .
## m.per.f.100 -0.020357 0.015836 -1.286 0.20373
## Median.Age 0.062546 0.025919 2.413 0.01900 *
## race.white -0.171731 0.053615 -3.203 0.00221 **
## race.black -0.173062 0.053271 -3.249 0.00193 **
## race.asian -0.206767 0.059874 -3.453 0.00104 **
## race.latin -0.007565 0.003741 -2.022 0.04780 *
## race.indigenous -0.324256 0.117851 -2.751 0.00790 **
## race.other -0.126541 0.051563 -2.454 0.01715 *
## i.agri -0.898461 0.430207 -2.088 0.04116 *
## i.mining -0.211939 0.098620 -2.149 0.03582 *
## i.utilities 1.045356 0.310692 3.365 0.00136 **
## i.finance.insur -0.089918 0.034397 -2.614 0.01138 *
## i.company.management 2.638886 1.003342 2.630 0.01091 *
## i.education 0.055289 0.018359 3.012 0.00385 **
## i.health -0.059161 0.021129 -2.800 0.00693 **
## i.other -0.077979 0.048001 -1.625 0.10968
## payroll.pct 0.002614 0.001329 1.967 0.05398 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.217 on 58 degrees of freedom
## Multiple R-squared: 0.4241, Adjusted R-squared: 0.2454
## F-statistic: 2.373 on 18 and 58 DF, p-value: 0.006829
performance <- as.data.frame(test_cor)
performance$step.back <- predict(step.back, testset)
performance$step <- predict(step, testset)
performance <- performance[order(performance$test_cor),]
plot(performance$test_cor)
lines(performance$step.back,type='l',col='blue')
lines(performance$step,type='l',col='red')
Nothing. Looks like none of our variables can predict the fairweatherness of a fanbase at a significant level.