DATA 698 : Capstone Research Project

1 Overview
2 Capstone Project on CPI and Employment
3 Data Preparation
4 Employment vs CPI - by Industry and Area
5 RAJWANT WORK
- 5.1 Model
6 References

1 Overview

We are in totally different era of twenty first century, and it gives us very rare situation where any positive news would help the humankind. We want to use the historical CPI data and find the relation of it with the employment, in hope that we would have some positive news on employment by following the trend of the data in past.

We feel that lower CPI would result in more job opportunity, as the it gives space for more competition in small business across sectors.

2 Capstone Project on CPI and Employment

The estimates of employment for 1998-2006 are based on the 2002 North American Industry Classification System (NAICS). The estimates for 2007-2010 are based on the 2007 NAICS. The estimates for 2011-2016 are based on the 2012 NAICS. The estimates for 2017 forward are based on the 2017 NAICS.
Excludes limited partners.
Under the 2007 NAICS, internet publishing and broadcasting was reclassified to other information services.

(NA) Not available.
(NM) Not meaningful.
1. Not shown to avoid disclosure of confidential information; estimates are included in higher-level totals.
1. Estimate for employment suppressed to cover corresponding estimate for earnings. Estimates for this item are included in the total.

Last updated: September 24, 2019- new statistics for 2018; revised statistics for 2014-2017.

3 Data Preparation

Load the required libraries

#packages <- c("R.rsp","pandocfilters","knitr","plyr","tidyr","dplyr","ggplot2","plotly","sqldf","MASS","reshape2","Amelia","mice","googleVis","stringi","ROCR","scatterplot3d","visdat","gridExtra","sjmisc", "devtools")
#install.packages (packages, repos="http://cran.us.r-project.org", dependencies=TRUE, lib="C:/Program Files/R/R-3.6.3/library")
#invisible (lapply (packages, library, character.only = T))
#devtools::install_github("hadley/ggplot2")

#if (!require(devtools)) install.packages("devtools", repos="http://cran.us.r-project.org", dependencies=TRUE, lib="C:/Program Files/R/R-3.6.3/library")
#devtools::install_github("boxuancui/DataExplorer")
#library("DataExplorer", lib.loc="C:/Program Files/R/R-3.6.3/library")

library(tidyverse)
library(kableExtra)
library(lubridate)
library(forecast)
library(stringi)
library(dplyr)

3.1 Load Employment Datasets

file_path <- "../data/cpi/SAINC/"

EMP_DATA_FILE <- paste0(file_path,"EMP9818.csv")
EMP_DATA <- read_delim(EMP_DATA_FILE, delim = ",", col_names = TRUE, trim_ws= TRUE) 
#,skip = 6,col_types = list(col_character(),col_character(),col_character(),col_character(),col_character(),col_character(),col_character(),col_character(),col_character(),col_character(),col_character(),col_character(),col_character(),col_character(),col_character(),col_character()))
# names(SAP_INV_DAT) = c("DROP","I_BILLDATE","I_ORG","I_DC","I_DOCCA","I_BILLTYPE","I_ORDER_REA","I_DIV","I_MATYPE","I_BILL_QTY","BLANK")
# SAP_INV_DAT<- SAP_INV_DAT[-which(is.na(SAP_INV_DAT$I_ORG)),] %>% .[-str_which(trimws(.$I_ORG),'SOrg.|-----'),c(-1,-11)]
# 3
# SAP_INV_DAT$I_BILLDATE <- date(parse_datetime(SAP_INV_DAT$I_BILLDATE, "%m/%d/%Y"))
# 
# head(SAP_INV_DAT)
summary(EMP_DATA)

##    GeoFIPS            GeoName              Region       TableName        
##  Length:7084        Length:7084        Min.   :1.000   Length:7084       
##  Class :character   Class :character   1st Qu.:3.000   Class :character  
##  Mode  :character   Mode  :character   Median :5.000   Mode  :character  
##                                        Mean   :4.475                     
##                                        3rd Qu.:6.000                     
##                                        Max.   :8.000                     
##                                        NA's   :122                       
##     LineCode      IndustryClassification Description            Unit          
##  Min.   :  10.0   Length:7084            Length:7084        Length:7084       
##  1st Qu.: 517.0   Class :character       Class :character   Class :character  
##  Median : 712.5   Mode  :character       Mode  :character   Mode  :character  
##  Mean   : 872.1                                                               
##  3rd Qu.:1103.0                                                               
##  Max.   :2012.0                                                               
##  NA's   :4                                                                    
##      1998               1999               2000               2001          
##  Length:7084        Length:7084        Length:7084        Length:7084       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##      2002               2003               2004               2005          
##  Length:7084        Length:7084        Length:7084        Length:7084       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##      2006               2007               2008               2009          
##  Length:7084        Length:7084        Length:7084        Length:7084       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##      2010               2011               2012               2013          
##  Length:7084        Length:7084        Length:7084        Length:7084       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##      2014               2015               2016               2017          
##  Length:7084        Length:7084        Length:7084        Length:7084       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##      2018          
##  Length:7084       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

DT::datatable(EMP_DATA)

# Reading the Category and detail Desc for ref.
EMP_DEF_FILE <- paste0(file_path,"SAEMP25N__definition.xml")
IND_DESC <- XML::xmlToDataFrame(EMP_DEF_FILE)
names(IND_DESC) <- c("LineCode","Description")
glimpse(IND_DESC)

## Observations: 118
## Variables: 3
## $ LineCode    <fct> 10, 20, 40, 50, 60, 70, 80, 90, 100, 101, 102, 103, 200...
## $ Description <fct> "Total employment (number of jobs)", "Wage and salary e...
## $ NA          <fct> "A count of jobs, both full-time and part-time. It incl...

head(IND_DESC,5)

3.2 Show Relevant Employment Data

# MN-MAIN  | NN - RANGE  | NL - LAST |
EMAP_CAT_LAST <- EMP_DATA[,c(2,5,6,7)] %>% 
                  filter(GeoName=="United States") %>% 
                  mutate(ISM = ifelse(IndustryClassification=="...","M","N")) %>% 
                  separate(IndustryClassification, c("From", "To"))  %>% 
                  mutate(ISML = ifelse(is.na(To)==T,"L","N")) %>% 
                  filter(ISML %in% c("L","N")) %>%  
                  select(.,LineCode,Description)
#DT::datatable(EMAP_CAT_LAST)

EMP_DAT_CAT <- EMP_DATA[,c(2,5,9:29)] %>% 
                left_join(EMAP_CAT_LAST,.,by="LineCode")
glimpse(EMP_DAT_CAT)

## Observations: 7,080
## Variables: 24
## $ LineCode    <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,...
## $ Description <chr> "Total employment (number of jobs)", "Total employment ...
## $ GeoName     <chr> "United States", "Alabama", "Alaska", "Arizona", "Arkan...
## $ `1998`      <chr> "158481200", "2361892", "382166", "2616288", "1445536",...
## $ `1999`      <chr> "161531300", "2378217", "381307", "2695892", "1460374",...
## $ `2000`      <chr> "165370800", "2392225", "389734", "2801510", "1482449",...
## $ `2001`      <chr> "165522200", "2376053", "394565", "2829002", "1482587",...
## $ `2002`      <chr> "165095100", "2364828", "402187", "2847095", "1478929",...
## $ `2003`      <chr> "165921500", "2371430", "405621", "2917121", "1482035",...
## $ `2004`      <chr> "168839700", "2425649", "413864", "3041476", "1505095",...
## $ `2005`      <chr> "172338400", "2486833", "421419", "3219820", "1537680",...
## $ `2006`      <chr> "175868600", "2545556", "431320", "3375218", "1567682",...
## $ `2007`      <chr> "179543700", "2604078", "439825", "3465075", "1582858",...
## $ `2008`      <chr> "179213900", "2582591", "443538", "3402808", "1579283",...
## $ `2009`      <chr> "173636700", "2479507", "442447", "3228493", "1542944",...
## $ `2010`      <chr> "172901700", "2460298", "443904", "3181571", "1541272",...
## $ `2011`      <chr> "176091700", "2497933", "450364", "3239045", "1561948",...
## $ `2012`      <chr> "178979700", "2503678", "459222", "3295537", "1565142",...
## $ `2013`      <chr> "182325100", "2523338", "461110", "3371219", "1569249",...
## $ `2014`      <chr> "186233800", "2551872", "461327", "3448173", "1587414",...
## $ `2015`      <chr> "190315800", "2586885", "461767", "3548174", "1610779",...
## $ `2016`      <chr> "193371900", "2619154", "457371", "3646604", "1629237",...
## $ `2017`      <chr> "196825300", "2653968", "456799", "3751283", "1644432",...
## $ `2018`      <chr> "200746000", "2691517", "459178", "3859137", "1663188",...

head(EMP_DAT_CAT,130)

#DT::datatable(EMP_DAT_CAT)

temp_name <- names(EMP_DAT_CAT [,c(4:24)])
EMP_DAT_CAT_SAN <-  data.frame(lapply(EMP_DAT_CAT [,c(4:24)],function(x) {
                      gsub("(D)|(T)",NA,x)
                    })) 
names(EMP_DAT_CAT_SAN) = temp_name
#DT::datatable(EMP_DAT_CAT_SAN)

EMP_DAT_CAT_SAN <- cbind(EMP_DAT_CAT[,c(1:3)],EMP_DAT_CAT_SAN)
#DT::datatable(EMP_DAT_CAT_SAN)

3.2.1 Employment from Select Industry by Area

3.2.1.1 AK - Alaska

EMP_DAT_CAT_SAN_AK <- EMP_DAT_CAT_SAN %>% 
                        filter(LineCode %in% c(705,1100,800,1600,1500)) %>% 
                        filter(GeoName=="Alaska") %>% 
                        reshape2::melt(
                          id.vars = c("LineCode","Description","GeoName"), 
                          variable.name = "Year", 
                          value.name = "Number of jobs")

EMP_DAT_CAT_SAN_AK %>%
  #sample_frac(0.33) %>% 
  DT::datatable()

3.2.1.2 AZ - Arizona

EMP_DAT_CAT_SAN_AZ <- EMP_DAT_CAT_SAN %>% 
                        filter(LineCode %in% c(705,1100,800,1600,1500)) %>% 
                        filter(GeoName=="Arizona") %>% 
                        reshape2::melt(
                          id.vars = c("LineCode","Description","GeoName"), 
                          variable.name = "Year", 
                          value.name = "Number of jobs")

EMP_DAT_CAT_SAN_AZ %>%
  #sample_frac(0.33) %>% 
  DT::datatable()

3.2.1.3 CA - California

EMP_DAT_CAT_SAN_CA <- EMP_DAT_CAT_SAN %>% 
                        filter(LineCode %in% c(705,1100,800,1600,1500)) %>% 
                        filter(GeoName=="California") %>% 
                        reshape2::melt(
                          id.vars = c("LineCode","Description","GeoName"), 
                          variable.name = "Year", 
                          value.name = "Number of jobs")

EMP_DAT_CAT_SAN_CA %>%
  #sample_frac(0.33) %>% 
  DT::datatable()

3.2.1.4 CO - Colorado

EMP_DAT_CAT_SAN_CO <- EMP_DAT_CAT_SAN %>% 
                        filter(LineCode %in% c(705,1100,800,1600,1500)) %>% 
                        filter(GeoName=="Colorado") %>% 
                        reshape2::melt(
                          id.vars = c("LineCode","Description","GeoName"), 
                          variable.name = "Year", 
                          value.name = "Number of jobs")

EMP_DAT_CAT_SAN_CO %>%
  #sample_frac(0.33) %>% 
  DT::datatable()

3.2.1.5 FL - Florida

EMP_DAT_CAT_SAN_FL <- EMP_DAT_CAT_SAN %>% 
                        filter(LineCode %in% c(705,1100,800,1600,1500)) %>% 
                        filter(GeoName=="Florida") %>% 
                        reshape2::melt(
                          id.vars = c("LineCode","Description","GeoName"), 
                          variable.name = "Year", 
                          value.name = "Number of jobs")

EMP_DAT_CAT_SAN_FL %>%
  #sample_frac(0.33) %>% 
  DT::datatable()

3.2.1.6 GA - Georgia

EMP_DAT_CAT_SAN_GA <- EMP_DAT_CAT_SAN %>% 
                        filter(LineCode %in% c(705,1100,800,1600,1500)) %>% 
                        filter(GeoName=="Georgia") %>% 
                        reshape2::melt(
                          id.vars = c("LineCode","Description","GeoName"), 
                          variable.name = "Year", 
                          value.name = "Number of jobs")

EMP_DAT_CAT_SAN_GA %>%
  #sample_frac(0.33) %>% 
  DT::datatable()

3.2.1.7 HI - Hawaii

EMP_DAT_CAT_SAN_HI <- EMP_DAT_CAT_SAN %>% 
                        filter(LineCode %in% c(705,1100,800,1600,1500)) %>% 
                        filter(GeoName=="Hawaii") %>% 
                        reshape2::melt(
                          id.vars = c("LineCode","Description","GeoName"), 
                          variable.name = "Year", 
                          value.name = "Number of jobs")

EMP_DAT_CAT_SAN_HI %>%
  #sample_frac(0.33) %>% 
  DT::datatable()

3.2.1.8 IL - Illinois

EMP_DAT_CAT_SAN_IL <- EMP_DAT_CAT_SAN %>% 
                        filter(LineCode %in% c(705,1100,800,1600,1500)) %>% 
                        filter(GeoName=="Illinois") %>% 
                        reshape2::melt(
                          id.vars = c("LineCode","Description","GeoName"), 
                          variable.name = "Year", 
                          value.name = "Number of jobs")

EMP_DAT_CAT_SAN_IL %>%
  #sample_frac(0.33) %>% 
  DT::datatable()

3.2.1.9 KS - Kansas

EMP_DAT_CAT_SAN_KS <- EMP_DAT_CAT_SAN %>% 
                        filter(LineCode %in% c(705,1100,800,1600,1500)) %>% 
                        filter(GeoName=="Kansas") %>% 
                        reshape2::melt(
                          id.vars = c("LineCode","Description","GeoName"), 
                          variable.name = "Year", 
                          value.name = "Number of jobs")

EMP_DAT_CAT_SAN_KS %>%
  #sample_frac(0.33) %>% 
  DT::datatable()

3.2.1.10 MA - Massachusetts

EMP_DAT_CAT_SAN_MA <- EMP_DAT_CAT_SAN %>% 
                        filter(LineCode %in% c(705,1100,800,1600,1500)) %>% 
                        filter(GeoName=="Massachusetts") %>% 
                        reshape2::melt(
                          id.vars = c("LineCode","Description","GeoName"), 
                          variable.name = "Year", 
                          value.name = "Number of jobs")

EMP_DAT_CAT_SAN_MA %>%
  #sample_frac(0.33) %>% 
  DT::datatable()

3.2.1.11 MI - Michigan

EMP_DAT_CAT_SAN_MI <- EMP_DAT_CAT_SAN %>% 
                        filter(LineCode %in% c(705,1100,800,1600,1500)) %>% 
                        filter(GeoName=="Michigan") %>% 
                        reshape2::melt(
                          id.vars = c("LineCode","Description","GeoName"), 
                          variable.name = "Year", 
                          value.name = "Number of jobs")

EMP_DAT_CAT_SAN_MI %>%
  #sample_frac(0.33) %>% 
  DT::datatable()

3.2.1.12 MN - Minnesota

EMP_DAT_CAT_SAN_MN <- EMP_DAT_CAT_SAN %>% 
                        filter(LineCode %in% c(705,1100,800,1600,1500)) %>% 
                        filter(GeoName=="Michigan") %>% 
                        reshape2::melt(
                          id.vars = c("LineCode","Description","GeoName"), 
                          variable.name = "Year", 
                          value.name = "Number of jobs")

EMP_DAT_CAT_SAN_MN %>%
  #sample_frac(0.33) %>% 
  DT::datatable()

3.2.1.13 MO - Missouri

EMP_DAT_CAT_SAN_MO <- EMP_DAT_CAT_SAN %>% 
                        filter(LineCode %in% c(705,1100,800,1600,1500)) %>% 
                        filter(GeoName=="Missouri") %>% 
                        reshape2::melt(
                          id.vars = c("LineCode","Description","GeoName"), 
                          variable.name = "Year", 
                          value.name = "Number of jobs")

EMP_DAT_CAT_SAN_MO %>%
  #sample_frac(0.33) %>% 
  DT::datatable()

3.2.1.14 NY - New York

EMP_DAT_CAT_SAN_NY <- EMP_DAT_CAT_SAN %>% 
                        filter(LineCode %in% c(705,1100,800,1600,1500)) %>% 
                        filter(GeoName=="New York") %>% 
                        reshape2::melt(
                          id.vars = c("LineCode","Description","GeoName"), 
                          variable.name = "Year", 
                          value.name = "Number of jobs")

EMP_DAT_CAT_SAN_NY %>%
  #sample_frac(0.33) %>% 
  DT::datatable()

3.2.1.15 OH - Ohio

EMP_DAT_CAT_SAN_OH <- EMP_DAT_CAT_SAN %>% 
                        filter(LineCode %in% c(705,1100,800,1600,1500)) %>% 
                        filter(GeoName=="Ohio") %>% 
                        reshape2::melt(
                          id.vars = c("LineCode","Description","GeoName"), 
                          variable.name = "Year", 
                          value.name = "Number of jobs")

EMP_DAT_CAT_SAN_OH %>%
  #sample_frac(0.33) %>% 
  DT::datatable()

3.2.1.16 OR - Oregon

EMP_DAT_CAT_SAN_OR <- EMP_DAT_CAT_SAN %>% 
                        filter(LineCode %in% c(705,1100,800,1600,1500)) %>% 
                        filter(GeoName=="Oregon") %>% 
                        reshape2::melt(
                          id.vars = c("LineCode","Description","GeoName"), 
                          variable.name = "Year", 
                          value.name = "Number of jobs")

EMP_DAT_CAT_SAN_OR %>%
  #sample_frac(0.33) %>% 
  DT::datatable()

3.2.1.17 PA - Pennsylvania

EMP_DAT_CAT_SAN_PA <- EMP_DAT_CAT_SAN %>% 
                        filter(LineCode %in% c(705,1100,800,1600,1500)) %>% 
                        filter(GeoName=="Pennsylvania") %>% 
                        reshape2::melt(
                          id.vars = c("LineCode","Description","GeoName"), 
                          variable.name = "Year", 
                          value.name = "Number of jobs")

EMP_DAT_CAT_SAN_PA %>%
  #sample_frac(0.33) %>% 
  DT::datatable()

3.2.1.18 TX - Texas

EMP_DAT_CAT_SAN_TX <- EMP_DAT_CAT_SAN %>% 
                        filter(LineCode %in% c(705,1100,800,1600,1500)) %>% 
                        filter(GeoName=="Texas") %>% 
                        reshape2::melt(
                          id.vars = c("LineCode","Description","GeoName"), 
                          variable.name = "Year", 
                          value.name = "Number of jobs")

EMP_DAT_CAT_SAN_TX %>%
  #sample_frac(0.33) %>% 
  DT::datatable()

3.2.1.19 WA - Washington

EMP_DAT_CAT_SAN_WA <- EMP_DAT_CAT_SAN %>% 
                        filter(LineCode %in% c(705,1100,800,1600,1500)) %>% 
                        filter(GeoName=="Washington") %>% 
                        reshape2::melt(
                          id.vars = c("LineCode","Description","GeoName"), 
                          variable.name = "Year", 
                          value.name = "Number of jobs")

EMP_DAT_CAT_SAN_WA %>%
  #sample_frac(0.33) %>% 
  DT::datatable()

3.2.1.20 WI - Wisconsin

EMP_DAT_CAT_SAN_WI <- EMP_DAT_CAT_SAN %>% 
                        filter(LineCode %in% c(705,1100,800,1600,1500)) %>% 
                        filter(GeoName=="Wisconsin") %>% 
                        reshape2::melt(
                          id.vars = c("LineCode","Description","GeoName"), 
                          variable.name = "Year", 
                          value.name = "Number of jobs")

EMP_DAT_CAT_SAN_WI %>%
  #sample_frac(0.33) %>% 
  DT::datatable()

3.3 Load CPI Data

file_path <- "../data/cpi/archive/"

## Read files named *.csv
filenames <- list.files(path=file_path,
                        pattern="*.csv",
                        full.names = TRUE)
#print(filenames)

##Create list of data frame names without the ".csv" part 
names <- gsub(pattern='\\.', 
              replacement='_', # remove the last 4 characters from filename (_clean)
              x=(sub(pattern = "(.*)\\..*$", replacement = "\\1", # remove extenson from filename (.csv)
                    basename(filenames) # get filename without the directory file path
                   )
                 )
              )
#print(names)

###Load all files
for(i in filenames){
  ## Create list of data frame names without the ".csv" part 
  name <- gsub(pattern='\\.', 
               replacement='_', # remove the last 4 characters from filename (_clean)
               x=(sub(pattern = "(.*)\\..*$", replacement = "\\1", # remove extenson from filename (.csv)
                      basename(i) # get filename without the directory file path
                     )
                 )
              )
   print(name)
   assign(x=name, value=data.frame(read.csv(i)))
   #DT::datatable(name)
}

## [1] "cu_area"
## [1] "cu_base"
## [1] "cu_data_0_Current"
## [1] "cu_data_1_AllItems"
## [1] "cu_data_10_OtherWest"
## [1] "cu_data_11_USFoodBeverage"
## [1] "cu_data_12_USHousing"
## [1] "cu_data_13_USApparel"
## [1] "cu_data_14_USTransportation"
## [1] "cu_data_15_USMedical"
## [1] "cu_data_16_USRecreation"
## [1] "cu_data_17_USEducationAndCommunication"
## [1] "cu_data_18_USOtherGoodsAndServices"
## [1] "cu_data_19_PopulationSize"
## [1] "cu_data_2_Summaries"
## [1] "cu_data_20_USCommoditiesServicesSpecial"
## [1] "cu_data_3_AsizeNorthEast"
## [1] "cu_data_4_AsizeNorthCentral"
## [1] "cu_data_5_AsizeSouth"
## [1] "cu_data_6_AsizeWest"
## [1] "cu_data_7_OtherNorthEast"
## [1] "cu_data_8_OtherNorthCentral"
## [1] "cu_data_9_OtherSouth"
## [1] "cu_footnote"
## [1] "cu_item"
## [1] "cu_period"
## [1] "cu_periodicity"
## [1] "cu_series"

3.4 Show CPI Reference Data

3.4.1 Area

3.4.2 Base

3.4.3 Item

3.4.4 Periodicity

3.4.5 Series

3.4.6 Period

3.5 Combined Reference Data

3.6 Show CPI Data from Select Industry

3.6.1 CPI - US Food Beverages

FoodItemCode <- cu_reference_data %>%
                    filter(grepl('SAF|SEF|SS0|SS1|SS20', item_code))
                    #dplyr::filter(item_code %like% 'SAH' | item_code %like% 'SEH')
#DT::datatable((FoodItemCode))

cu_data_USFoodBeverage = cu_data_0_Current %>% 
                              inner_join(FoodItemCode, by="series_id") %>% 
                              inner_join(cu_period, by="period") %>% 
                               filter(year %in% (1998:2018)) %>% 
                               select(c("series_id",
                                        "series_title",
                                        "year",
                                        "period",
                                        "period_abbr",
                                        "period_name",
                                        "value",
                                        "area_code",
                                        "area_name",
                                        "area_display_level",
                                        "area_selectable",
                                        "area_sort_sequence",
                                        "item_code",
                                        "item_name",
                                        "item_display_level",
                                        "item_selectable",
                                        "item_sort_sequence",
                                        "seasonal",
                                        "periodicity_code",
                                        "periodicity_name",
                                        "base_code",
                                        "base_name",
                                        "base_period"))
DT::datatable(head(cu_data_USFoodBeverage))

3.6.2 CPI - US Housing

HousingItemCode <- cu_reference_data %>%
                    #filter(grepl('SAH|SEH', item_code, fixed = TRUE))
                    filter(grepl('SAH|SEH', item_code))
                    #dplyr::filter(item_code %like% 'SAH' | item_code %like% 'SEH')
#DT::datatable((HousingItemCode))

cu_data_USHousing = cu_data_0_Current %>% 
                              inner_join(HousingItemCode, by="series_id") %>% 
                              inner_join(cu_period, by="period") %>% 
                               #filter(item_code %like% c('SAH','SEH')) %>%
                               filter(year %in% (1998:2018)) %>% 
                               select(c("series_id",
                                        "series_title",
                                        "year",
                                        "period",
                                        "period_abbr",
                                        "period_name",
                                        "value",
                                        "area_code",
                                        "area_name",
                                        "area_display_level",
                                        "area_selectable",
                                        "area_sort_sequence",
                                        "item_code",
                                        "item_name",
                                        "item_display_level",
                                        "item_selectable",
                                        "item_sort_sequence",
                                        "seasonal",
                                        "periodicity_code",
                                        "periodicity_name",
                                        "base_code",
                                        "base_name",
                                        "base_period"))
DT::datatable(head(cu_data_USHousing))

3.6.3 CPI - US Transportation

TransportItemCode <- cu_reference_data %>%
                    filter(grepl('SAT|SET|SS4|SS52|SS53|SAS24|SAS4', item_code))
                    #dplyr::filter(item_code %like% 'SAH' | item_code %like% 'SEH')
#DT::datatable((TransportItemCode))

cu_data_USTransportation = cu_data_0_Current %>% 
                              inner_join(TransportItemCode, by="series_id") %>% 
                              inner_join(cu_period, by="period") %>% 
                               filter(year %in% (1998:2018)) %>% 
                               select(c("series_id",
                                        "series_title",
                                        "year",
                                        "period",
                                        "period_abbr",
                                        "period_name",
                                        "value",
                                        "area_code",
                                        "area_name",
                                        "area_display_level",
                                        "area_selectable",
                                        "area_sort_sequence",
                                        "item_code",
                                        "item_name",
                                        "item_display_level",
                                        "item_selectable",
                                        "item_sort_sequence",
                                        "seasonal",
                                        "periodicity_code",
                                        "periodicity_name",
                                        "base_code",
                                        "base_name",
                                        "base_period"))
DT::datatable(head(cu_data_USTransportation))

3.6.4 CPI - US Medical

MedicalItemCode <- cu_reference_data %>%
                    filter(grepl('SAM|SEM|SS57', item_code))
                    #dplyr::filter(item_code %like% 'SAH' | item_code %like% 'SEH')
#DT::datatable((MedicalItemCode))

cu_data_USMedical = cu_data_0_Current %>% 
                              inner_join(MedicalItemCode, by="series_id") %>% 
                              inner_join(cu_period, by="period") %>% 
                               filter(year %in% (1998:2018)) %>% 
                               select(c("series_id",
                                        "series_title",
                                        "year",
                                        "period",
                                        "period_abbr",
                                        "period_name",
                                        "value",
                                        "area_code",
                                        "area_name",
                                        "area_display_level",
                                        "area_selectable",
                                        "area_sort_sequence",
                                        "item_code",
                                        "item_name",
                                        "item_display_level",
                                        "item_selectable",
                                        "item_sort_sequence",
                                        "seasonal",
                                        "periodicity_code",
                                        "periodicity_name",
                                        "base_code",
                                        "base_name",
                                        "base_period"))
DT::datatable(head(cu_data_USMedical))

3.6.5 CPI - US Education And Communication

EducationItemCode <- cu_reference_data %>%
                    filter(grepl('SAE|SEE|SSE|SS27', item_code))
                    #dplyr::filter(item_code %like% 'SAH' | item_code %like% 'SEH')
#DT::datatable((EducationItemCode))

cu_data_USEducationAndCommunication = cu_data_0_Current %>% 
                                        inner_join(EducationItemCode, by="series_id") %>% 
                                        inner_join(cu_period, by="period") %>% 
                                         filter(year %in% (1998:2018)) %>% 
                                         select(c("series_id",
                                                  "series_title",
                                                  "year",
                                                  "period",
                                                  "period_abbr",
                                                  "period_name",
                                                  "value",
                                                  "area_code",
                                                  "area_name",
                                                  "area_display_level",
                                                  "area_selectable",
                                                  "area_sort_sequence",
                                                  "item_code",
                                                  "item_name",
                                                  "item_display_level",
                                                  "item_selectable",
                                                  "item_sort_sequence",
                                                  "seasonal",
                                                  "periodicity_code",
                                                  "periodicity_name",
                                                  "base_code",
                                                  "base_name",
                                                  "base_period"))
DT::datatable(head(cu_data_USEducationAndCommunication))

4 Employment vs CPI - by Industry and Area

4.1 Food & Beverages

4.1.1 AK - Alaska : Anchorage

cu_emp_data_USFoodBeverage_AK = cu_data_USFoodBeverage %>% 
                                  filter(area_name=="Anchorage", 
                                         period %in% c("S01","S02")) %>% 
                                  mutate(parent_item_code = 'SAF', 
                                         emp_line_code = 705) %>% 
                                   select(c("parent_item_code",
                                            "emp_line_code",
                                            "series_id",
                                            "series_title",
                                            "year",
                                            "period",
                                            "period_abbr",
                                            "period_name",
                                            "value",
                                            "area_code",
                                            "area_name",
                                            "area_display_level",
                                            "area_selectable",
                                            "area_sort_sequence",
                                            "item_code",
                                            "item_name",
                                            "item_display_level",
                                            "item_selectable",
                                            "item_sort_sequence",
                                            "seasonal",
                                            "periodicity_code",
                                            "periodicity_name",
                                            "base_code",
                                            "base_name",
                                            "base_period")) %>% 
                                  group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                                  summarise(cpi_value = round(sum(value),2))
#cu_emp_data_USFoodBeverage_AK %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Food and beverage stores
#EMP_DAT_CAT_SAN_USFoodBeverage_AK = EMP_DAT_CAT_SAN_AK %>%
#                                      filter(LineCode==705)%>%
#                                      mutate(parent_line_code = 705)

cu_emp_data_USFoodBeverage_AK = merge(EMP_DAT_CAT_SAN_AK, cu_emp_data_USFoodBeverage_AK, 
                                       by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cu_emp_data_USFoodBeverage_AK %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.1.2 AZ - Arizona : Phoenix-Mesa

cu_emp_data_USFoodBeverage_AZ = cu_data_USFoodBeverage %>% 
                                  filter(area_name=="Phoenix-Mesa", 
                                         period %in% c("S01","S02")) %>% 
                                  mutate(parent_item_code = 'SAF', 
                                         emp_line_code = 705) %>% 
                                   select(c("parent_item_code",
                                            "emp_line_code",
                                            "series_id",
                                            "series_title",
                                            "year",
                                            "period",
                                            "period_abbr",
                                            "period_name",
                                            "value",
                                            "area_code",
                                            "area_name",
                                            "area_display_level",
                                            "area_selectable",
                                            "area_sort_sequence",
                                            "item_code",
                                            "item_name",
                                            "item_display_level",
                                            "item_selectable",
                                            "item_sort_sequence",
                                            "seasonal",
                                            "periodicity_code",
                                            "periodicity_name",
                                            "base_code",
                                            "base_name",
                                            "base_period")) %>% 
                                  group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                                  summarise(cpi_value = round(sum(value),2))
#cu_emp_data_USFoodBeverage_AK %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Food and beverage stores
#EMP_DAT_CAT_SAN_USFoodBeverage_AK = EMP_DAT_CAT_SAN_AK %>%
#                                      filter(LineCode==705)%>%
#                                      mutate(parent_line_code = 705)

cu_emp_data_USFoodBeverage_AZ = merge(EMP_DAT_CAT_SAN_AZ, cu_emp_data_USFoodBeverage_AZ, 
                                       by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cu_emp_data_USFoodBeverage_AZ %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.1.3 CA - California

Los Angeles-Riverside-Orange County
San Francisco-Oakland-San Jose
San Diego

cu_emp_data_USFoodBeverage_CA = cu_data_USFoodBeverage %>% 
                                   filter( (area_name %in% c("Los Angeles-Riverside-Orange County", "San Francisco-Oakland-San Jose")
                                            & period %in% c("M01","M02","M03","M04","M05","M06","M07","M08","M09","M10","M11","M12")
                                            )
                                           |
                                            (area_name %in% c("San Diego")
                                             & period %in% c("S01","S02")
                                             )
                                         ) %>% 
                                  mutate(parent_item_code = 'SAF', 
                                         emp_line_code = 705,
                                         area_code = 'A421;A422;A424',
                                         area_name = 'Los Angeles-Riverside-Orange County;San Francisco-Oakland-San Jose;San Diego'
                                         ) %>% 
                                   select(c("parent_item_code",
                                            "emp_line_code",
                                            "series_id",
                                            "series_title",
                                            "year",
                                            "period",
                                            "period_abbr",
                                            "period_name",
                                            "value",
                                            "area_code",
                                            "area_name",
                                            "area_display_level",
                                            "area_selectable",
                                            "area_sort_sequence",
                                            "item_code",
                                            "item_name",
                                            "item_display_level",
                                            "item_selectable",
                                            "item_sort_sequence",
                                            "seasonal",
                                            "periodicity_code",
                                            "periodicity_name",
                                            "base_code",
                                            "base_name",
                                            "base_period")) %>% 
                                  group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                                  summarise(cpi_value = round(sum(value/3),2))

#cu_emp_data_USFoodBeverage_CA %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Food and beverage stores
#EMP_DAT_CAT_SAN_USFoodBeverage_AK = EMP_DAT_CAT_SAN_AK %>%
#                                      filter(LineCode==705)%>%
#                                      mutate(parent_line_code = 705)

cu_emp_data_USFoodBeverage_CA = merge(EMP_DAT_CAT_SAN_CA, cu_emp_data_USFoodBeverage_CA, 
                                      by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cu_emp_data_USFoodBeverage_CA %>%
  #sample_frac(0.33) %>% 
  DT::datatable()

4.1.4 CO - Colorado : Denver-Boulder-Greeley

cu_emp_data_USFoodBeverage_CO = cu_data_USFoodBeverage %>% 
                                  filter(area_name %in% c("Denver-Boulder-Greeley"), 
                                         period %in% c("S01","S02")) %>% 
                                  mutate(parent_item_code = 'SAF', 
                                         emp_line_code = 705) %>% 
                                   select(c("parent_item_code",
                                            "emp_line_code",
                                            "series_id",
                                            "series_title",
                                            "year",
                                            "period",
                                            "period_abbr",
                                            "period_name",
                                            "value",
                                            "area_code",
                                            "area_name",
                                            "area_display_level",
                                            "area_selectable",
                                            "area_sort_sequence",
                                            "item_code",
                                            "item_name",
                                            "item_display_level",
                                            "item_selectable",
                                            "item_sort_sequence",
                                            "seasonal",
                                            "periodicity_code",
                                            "periodicity_name",
                                            "base_code",
                                            "base_name",
                                            "base_period")) %>% 
                                  group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                                  summarise(cpi_value = round(sum(value),2))
#cu_emp_data_USFoodBeverage_AK %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Food and beverage stores
#EMP_DAT_CAT_SAN_USFoodBeverage_AK = EMP_DAT_CAT_SAN_AK %>%
#                                      filter(LineCode==705)%>%
#                                      mutate(parent_line_code = 705)

cu_emp_data_USFoodBeverage_CO = merge(EMP_DAT_CAT_SAN_CO, cu_emp_data_USFoodBeverage_CO, 
                                       by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cu_emp_data_USFoodBeverage_CO %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.1.5 FL - Florida

Miami-Fort Lauderdale
Tampa-St. Petersburg-Clearwater

cu_emp_data_USFoodBeverage_FL = cu_data_USFoodBeverage %>% 
                                  filter( (area_name %in% c("Miami-Fort Lauderdale")
                                          & period %in% c("M01","M02","M03","M04","M05","M06","M07","M08","M09","M10","M11","M12")
                                          )
                                         |
                                          (area_name %in% c("Tampa-St. Petersburg-Clearwater")
                                           & period %in% c("S01","S02")
                                           )
                                       ) %>% 
                                  mutate(parent_item_code = 'SAF', 
                                         emp_line_code = 705,
                                         area_code = 'A320;A321',
                                         area_name = 'Miami-Fort Lauderdale;Tampa-St. Petersburg-Clearwater') %>% 
                                   select(c("parent_item_code",
                                            "emp_line_code",
                                            "series_id",
                                            "series_title",
                                            "year",
                                            "period",
                                            "period_abbr",
                                            "period_name",
                                            "value",
                                            "area_code",
                                            "area_name",
                                            "area_display_level",
                                            "area_selectable",
                                            "area_sort_sequence",
                                            "item_code",
                                            "item_name",
                                            "item_display_level",
                                            "item_selectable",
                                            "item_sort_sequence",
                                            "seasonal",
                                            "periodicity_code",
                                            "periodicity_name",
                                            "base_code",
                                            "base_name",
                                            "base_period")) %>% 
                                  group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                                  summarise(cpi_value = round(sum(value)/2,2))
#cu_emp_data_USFoodBeverage_FL %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Food and beverage stores
#EMP_DAT_CAT_SAN_USFoodBeverage_AK = EMP_DAT_CAT_SAN_AK %>%
#                                      filter(LineCode==705)%>%
#                                      mutate(parent_line_code = 705)

cu_emp_data_USFoodBeverage_FL = merge(EMP_DAT_CAT_SAN_FL, cu_emp_data_USFoodBeverage_FL, 
                                       by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cu_emp_data_USFoodBeverage_FL %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.1.6 GA - Georgia

Atlanta

cu_emp_data_USFoodBeverage_GA = cu_data_USFoodBeverage %>% 
                                  filter(area_name=="Atlanta", 
                                         period %in% c("M01","M02","M03","M04","M05","M06","M07","M08","M09","M10","M11","M12")) %>% 
                                  mutate(parent_item_code = 'SAF', 
                                         emp_line_code = 705) %>% 
                                   select(c("parent_item_code",
                                            "emp_line_code",
                                            "series_id",
                                            "series_title",
                                            "year",
                                            "period",
                                            "period_abbr",
                                            "period_name",
                                            "value",
                                            "area_code",
                                            "area_name",
                                            "area_display_level",
                                            "area_selectable",
                                            "area_sort_sequence",
                                            "item_code",
                                            "item_name",
                                            "item_display_level",
                                            "item_selectable",
                                            "item_sort_sequence",
                                            "seasonal",
                                            "periodicity_code",
                                            "periodicity_name",
                                            "base_code",
                                            "base_name",
                                            "base_period")) %>% 
                                  group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                                  summarise(cpi_value = round(sum(value),2))
#cu_emp_data_USFoodBeverage_MI %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Food and beverage stores
#EMP_DAT_CAT_SAN_USFoodBeverage_MI = EMP_DAT_CAT_SAN_NY %>%
#                                      filter(LineCode==705)%>%
#                                      mutate(parent_line_code = 705)

cpi_emp_data_USFoodBeverage_GA = merge(EMP_DAT_CAT_SAN_GA, cu_emp_data_USFoodBeverage_GA, 
                                       by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cpi_emp_data_USFoodBeverage_GA %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.1.7 HI - Hawaii

Honolulu

cu_emp_data_USFoodBeverage_HI = cu_data_USFoodBeverage %>% 
                                  filter(area_name=="Honolulu", 
                                         period %in% c("S01","S02")) %>% 
                                  mutate(parent_item_code = 'SAF', 
                                         emp_line_code = 705) %>% 
                                   select(c("parent_item_code",
                                            "emp_line_code",
                                            "series_id",
                                            "series_title",
                                            "year",
                                            "period",
                                            "period_abbr",
                                            "period_name",
                                            "value",
                                            "area_code",
                                            "area_name",
                                            "area_display_level",
                                            "area_selectable",
                                            "area_sort_sequence",
                                            "item_code",
                                            "item_name",
                                            "item_display_level",
                                            "item_selectable",
                                            "item_sort_sequence",
                                            "seasonal",
                                            "periodicity_code",
                                            "periodicity_name",
                                            "base_code",
                                            "base_name",
                                            "base_period")) %>% 
                                  group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                                  summarise(cpi_value = round(sum(value),2))
#cu_emp_data_USFoodBeverage_MI %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Food and beverage stores
#EMP_DAT_CAT_SAN_USFoodBeverage_MI = EMP_DAT_CAT_SAN_NY %>%
#                                      filter(LineCode==705)%>%
#                                      mutate(parent_line_code = 705)

cpi_emp_data_USFoodBeverage_HI = merge(EMP_DAT_CAT_SAN_HI, cu_emp_data_USFoodBeverage_HI, 
                                       by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cpi_emp_data_USFoodBeverage_HI %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.1.8 IL - Illinois

Chicago-Gary-Kenosha

cu_emp_data_USFoodBeverage_IL = cu_data_USFoodBeverage %>% 
                                  filter(area_name=="Chicago-Gary-Kenosha", 
                                         period %in% c("M01","M02","M03","M04","M05","M06","M07","M08","M09","M10","M11","M12")) %>% 
                                  mutate(parent_item_code = 'SAF', 
                                         emp_line_code = 705) %>% 
                                   select(c("parent_item_code",
                                            "emp_line_code",
                                            "series_id",
                                            "series_title",
                                            "year",
                                            "period",
                                            "period_abbr",
                                            "period_name",
                                            "value",
                                            "area_code",
                                            "area_name",
                                            "area_display_level",
                                            "area_selectable",
                                            "area_sort_sequence",
                                            "item_code",
                                            "item_name",
                                            "item_display_level",
                                            "item_selectable",
                                            "item_sort_sequence",
                                            "seasonal",
                                            "periodicity_code",
                                            "periodicity_name",
                                            "base_code",
                                            "base_name",
                                            "base_period")) %>% 
                                  group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                                  summarise(cpi_value = round(sum(value),2))
#cu_emp_data_USFoodBeverage_MI %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Food and beverage stores
#EMP_DAT_CAT_SAN_USFoodBeverage_MI = EMP_DAT_CAT_SAN_NY %>%
#                                      filter(LineCode==705)%>%
#                                      mutate(parent_line_code = 705)

cpi_emp_data_USFoodBeverage_IL = merge(EMP_DAT_CAT_SAN_IL, cu_emp_data_USFoodBeverage_IL, 
                                       by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cpi_emp_data_USFoodBeverage_IL %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.1.9 KS - Kansas

Kansas City

cu_emp_data_USFoodBeverage_KS = cu_data_USFoodBeverage %>% 
                                  filter(area_name=="Kansas City", 
                                         period %in% c("S01","S02")) %>% 
                                  mutate(parent_item_code = 'SAF', 
                                         emp_line_code = 705) %>% 
                                   select(c("parent_item_code",
                                            "emp_line_code",
                                            "series_id",
                                            "series_title",
                                            "year",
                                            "period",
                                            "period_abbr",
                                            "period_name",
                                            "value",
                                            "area_code",
                                            "area_name",
                                            "area_display_level",
                                            "area_selectable",
                                            "area_sort_sequence",
                                            "item_code",
                                            "item_name",
                                            "item_display_level",
                                            "item_selectable",
                                            "item_sort_sequence",
                                            "seasonal",
                                            "periodicity_code",
                                            "periodicity_name",
                                            "base_code",
                                            "base_name",
                                            "base_period")) %>% 
                                  group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                                  summarise(cpi_value = round(sum(value),2))
#cu_emp_data_USFoodBeverage_MI %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Food and beverage stores
#EMP_DAT_CAT_SAN_USFoodBeverage_MI = EMP_DAT_CAT_SAN_NY %>%
#                                      filter(LineCode==705)%>%
#                                      mutate(parent_line_code = 705)

cpi_emp_data_USFoodBeverage_KS = merge(EMP_DAT_CAT_SAN_KS, cu_emp_data_USFoodBeverage_KS, 
                                       by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cpi_emp_data_USFoodBeverage_KS %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.1.10 MA - Massachusetts

Boston-Brockton-Nashua

cu_emp_data_USFoodBeverage_MA = cu_data_USFoodBeverage %>% 
                                  filter(area_name=="Boston-Brockton-Nashua", 
                                         period %in% c("M01","M02","M03","M04","M05","M06","M07","M08","M09","M10","M11","M12")) %>% 
                                  mutate(parent_item_code = 'SAF', 
                                         emp_line_code = 705) %>% 
                                   select(c("parent_item_code",
                                            "emp_line_code",
                                            "series_id",
                                            "series_title",
                                            "year",
                                            "period",
                                            "period_abbr",
                                            "period_name",
                                            "value",
                                            "area_code",
                                            "area_name",
                                            "area_display_level",
                                            "area_selectable",
                                            "area_sort_sequence",
                                            "item_code",
                                            "item_name",
                                            "item_display_level",
                                            "item_selectable",
                                            "item_sort_sequence",
                                            "seasonal",
                                            "periodicity_code",
                                            "periodicity_name",
                                            "base_code",
                                            "base_name",
                                            "base_period")) %>% 
                                  group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                                  summarise(cpi_value = round(sum(value),2))
#cu_emp_data_USFoodBeverage_MI %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Food and beverage stores
#EMP_DAT_CAT_SAN_USFoodBeverage_MI = EMP_DAT_CAT_SAN_NY %>%
#                                      filter(LineCode==705)%>%
#                                      mutate(parent_line_code = 705)

cpi_emp_data_USFoodBeverage_MA = merge(EMP_DAT_CAT_SAN_MI, cu_emp_data_USFoodBeverage_MA, 
                                       by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cpi_emp_data_USFoodBeverage_MA %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.1.11 MI - Michigan

Detroit-Ann Arbor-Flint

cu_emp_data_USFoodBeverage_MI = cu_data_USFoodBeverage %>% 
                                  filter(area_name=="Detroit-Ann Arbor-Flint", 
                                         period %in% c("M01","M02","M03","M04","M05","M06","M07","M08","M09","M10","M11","M12")) %>% 
                                  mutate(parent_item_code = 'SAF', 
                                         emp_line_code = 705) %>% 
                                   select(c("parent_item_code",
                                            "emp_line_code",
                                            "series_id",
                                            "series_title",
                                            "year",
                                            "period",
                                            "period_abbr",
                                            "period_name",
                                            "value",
                                            "area_code",
                                            "area_name",
                                            "area_display_level",
                                            "area_selectable",
                                            "area_sort_sequence",
                                            "item_code",
                                            "item_name",
                                            "item_display_level",
                                            "item_selectable",
                                            "item_sort_sequence",
                                            "seasonal",
                                            "periodicity_code",
                                            "periodicity_name",
                                            "base_code",
                                            "base_name",
                                            "base_period")) %>% 
                                  group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                                  summarise(cpi_value = round(sum(value),2))
#cu_emp_data_USFoodBeverage_MI %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Food and beverage stores
#EMP_DAT_CAT_SAN_USFoodBeverage_MI = EMP_DAT_CAT_SAN_NY %>%
#                                      filter(LineCode==705)%>%
#                                      mutate(parent_line_code = 705)

cpi_emp_data_USFoodBeverage_MI = merge(EMP_DAT_CAT_SAN_MI, cu_emp_data_USFoodBeverage_MI, 
                                       by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cpi_emp_data_USFoodBeverage_MI %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.1.12 MN - Minnesota

Minneapolis-St. Paul

cu_emp_data_USFoodBeverage_MN = cu_data_USFoodBeverage %>% 
                                  filter(area_name=="Minneapolis-St. Paul", 
                                         period %in% c("S01","S02")) %>% 
                                  mutate(parent_item_code = 'SAF', 
                                         emp_line_code = 705) %>% 
                                   select(c("parent_item_code",
                                            "emp_line_code",
                                            "series_id",
                                            "series_title",
                                            "year",
                                            "period",
                                            "period_abbr",
                                            "period_name",
                                            "value",
                                            "area_code",
                                            "area_name",
                                            "area_display_level",
                                            "area_selectable",
                                            "area_sort_sequence",
                                            "item_code",
                                            "item_name",
                                            "item_display_level",
                                            "item_selectable",
                                            "item_sort_sequence",
                                            "seasonal",
                                            "periodicity_code",
                                            "periodicity_name",
                                            "base_code",
                                            "base_name",
                                            "base_period")) %>% 
                                  group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                                  summarise(cpi_value = round(sum(value),2))
#cu_emp_data_USFoodBeverage_NY %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Food and beverage stores
#EMP_DAT_CAT_SAN_USFoodBeverage_NY = EMP_DAT_CAT_SAN_NY %>%
#                                      filter(LineCode==705)%>%
#                                      mutate(parent_line_code = 705)

cpi_emp_data_USFoodBeverage_MN = merge(EMP_DAT_CAT_SAN_MN, cu_emp_data_USFoodBeverage_MN, 
                                       by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cpi_emp_data_USFoodBeverage_MN %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.1.13 MO - Missouri

St. Louis

cu_emp_data_USFoodBeverage_MO = cu_data_USFoodBeverage %>% 
                                  filter(area_name=="St. Louis", 
                                         period %in% c("S01","S02")) %>% 
                                  mutate(parent_item_code = 'SAF', 
                                         emp_line_code = 705) %>% 
                                   select(c("parent_item_code",
                                            "emp_line_code",
                                            "series_id",
                                            "series_title",
                                            "year",
                                            "period",
                                            "period_abbr",
                                            "period_name",
                                            "value",
                                            "area_code",
                                            "area_name",
                                            "area_display_level",
                                            "area_selectable",
                                            "area_sort_sequence",
                                            "item_code",
                                            "item_name",
                                            "item_display_level",
                                            "item_selectable",
                                            "item_sort_sequence",
                                            "seasonal",
                                            "periodicity_code",
                                            "periodicity_name",
                                            "base_code",
                                            "base_name",
                                            "base_period")) %>% 
                                  group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                                  summarise(cpi_value = round(sum(value),2))
#cu_emp_data_USFoodBeverage_NY %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Food and beverage stores
#EMP_DAT_CAT_SAN_USFoodBeverage_NY = EMP_DAT_CAT_SAN_NY %>%
#                                      filter(LineCode==705)%>%
#                                      mutate(parent_line_code = 705)

cpi_emp_data_USFoodBeverage_MO = merge(EMP_DAT_CAT_SAN_MO, cu_emp_data_USFoodBeverage_MO, 
                                       by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cpi_emp_data_USFoodBeverage_MO %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.1.14 NY - New York

New York-Northern New Jersey-Long Island

cu_emp_data_USFoodBeverage_NY = cu_data_USFoodBeverage %>% 
                                  filter(area_name=="New York-Northern New Jersey-Long Island", 
                                         period %in% c("M01","M02","M03","M04","M05","M06","M07","M08","M09","M10","M11","M12")) %>% 
                                  mutate(parent_item_code = 'SAF', 
                                         emp_line_code = 705) %>% 
                                   select(c("parent_item_code",
                                            "emp_line_code",
                                            "series_id",
                                            "series_title",
                                            "year",
                                            "period",
                                            "period_abbr",
                                            "period_name",
                                            "value",
                                            "area_code",
                                            "area_name",
                                            "area_display_level",
                                            "area_selectable",
                                            "area_sort_sequence",
                                            "item_code",
                                            "item_name",
                                            "item_display_level",
                                            "item_selectable",
                                            "item_sort_sequence",
                                            "seasonal",
                                            "periodicity_code",
                                            "periodicity_name",
                                            "base_code",
                                            "base_name",
                                            "base_period")) %>% 
                                  group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                                  summarise(cpi_value = round(sum(value),2))
#cu_emp_data_USFoodBeverage_NY %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Food and beverage stores
#EMP_DAT_CAT_SAN_USFoodBeverage_NY = EMP_DAT_CAT_SAN_NY %>%
#                                      filter(LineCode==705)%>%
#                                      mutate(parent_line_code = 705)

cpi_emp_data_USFoodBeverage_NY = merge(EMP_DAT_CAT_SAN_NY, cu_emp_data_USFoodBeverage_NY, 
                                       by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cpi_emp_data_USFoodBeverage_NY %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.1.15 OH - Ohio

Cleveland-Akron
Cincinnati-Hamilton

cu_emp_data_USFoodBeverage_OH = cu_data_USFoodBeverage %>% 
                                  filter( (area_name %in% c("Cleveland-Akron")
                                          & period %in% c("M01","M02","M03","M04","M05","M06","M07","M08","M09","M10","M11","M12")
                                          )
                                         |
                                          (area_name %in% c("Cincinnati-Hamilton")
                                           & period %in% c("S01","S02")
                                           )
                                       ) %>% 
                                  mutate(parent_item_code = 'SAF', 
                                         emp_line_code = 705,
                                         area_code = 'A210;A213', 
                                         area_name = 'Cleveland-Akron;Cincinnati-Hamilton'
                                         ) %>% 
                                   select(c("parent_item_code",
                                            "emp_line_code",
                                            "series_id",
                                            "series_title",
                                            "year",
                                            "period",
                                            "period_abbr",
                                            "period_name",
                                            "value",
                                            "area_code",
                                            "area_name",
                                            "area_display_level",
                                            "area_selectable",
                                            "area_sort_sequence",
                                            "item_code",
                                            "item_name",
                                            "item_display_level",
                                            "item_selectable",
                                            "item_sort_sequence",
                                            "seasonal",
                                            "periodicity_code",
                                            "periodicity_name",
                                            "base_code",
                                            "base_name",
                                            "base_period")) %>% 
                                  group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                                  summarise(cpi_value = round(sum(value)/2,2))
#cu_emp_data_USFoodBeverage_NY %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Food and beverage stores
#EMP_DAT_CAT_SAN_USFoodBeverage_NY = EMP_DAT_CAT_SAN_NY %>%
#                                      filter(LineCode==705)%>%
#                                      mutate(parent_line_code = 705)

cpi_emp_data_USFoodBeverage_OH = merge(EMP_DAT_CAT_SAN_OH, cu_emp_data_USFoodBeverage_OH, 
                                       by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cpi_emp_data_USFoodBeverage_OH %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.1.16 OR - Oregon

Portland-Salem

cu_emp_data_USFoodBeverage_OR = cu_data_USFoodBeverage %>% 
                                  filter(area_name=="Portland-Salem", 
                                         period %in% c("S01","S02")) %>% 
                                  mutate(parent_item_code = 'SAF', 
                                         emp_line_code = 705) %>% 
                                   select(c("parent_item_code",
                                            "emp_line_code",
                                            "series_id",
                                            "series_title",
                                            "year",
                                            "period",
                                            "period_abbr",
                                            "period_name",
                                            "value",
                                            "area_code",
                                            "area_name",
                                            "area_display_level",
                                            "area_selectable",
                                            "area_sort_sequence",
                                            "item_code",
                                            "item_name",
                                            "item_display_level",
                                            "item_selectable",
                                            "item_sort_sequence",
                                            "seasonal",
                                            "periodicity_code",
                                            "periodicity_name",
                                            "base_code",
                                            "base_name",
                                            "base_period")) %>% 
                                  group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                                  summarise(cpi_value = round(sum(value),2))

#cu_emp_data_USFoodBeverage_MI %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Food and beverage stores
#EMP_DAT_CAT_SAN_USFoodBeverage_MI = EMP_DAT_CAT_SAN_NY %>%
#                                      filter(LineCode==705)%>%
#                                      mutate(parent_line_code = 705)

cpi_emp_data_USFoodBeverage_OR = merge(EMP_DAT_CAT_SAN_OR, cu_emp_data_USFoodBeverage_OR, 
                                       by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cpi_emp_data_USFoodBeverage_OR %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.1.17 PA - Pennsylvania

Philadelphia-Wilmington-Atlantic City
Pittsburgh

cu_emp_data_USFoodBeverage_PA = cu_data_USFoodBeverage %>% 
                                  filter(area_name %in% c("Philadelphia-Wilmington-Atlantic City","Pittsburgh"), 
                                         period %in% c("M01","M02","M03","M04","M05","M06","M07","M08","M09","M10","M11","M12")) %>% 
                                  mutate(parent_item_code = 'SAF', 
                                         emp_line_code = 705,
                                         area_code = 'A102;A104', 
                                         area_name = 'Philadelphia-Wilmington-Atlantic City;Pittsburgh') %>% 
                                   select(c("parent_item_code",
                                            "emp_line_code",
                                            "series_id",
                                            "series_title",
                                            "year",
                                            "period",
                                            "period_abbr",
                                            "period_name",
                                            "value",
                                            "area_code",
                                            "area_name",
                                            "area_display_level",
                                            "area_selectable",
                                            "area_sort_sequence",
                                            "item_code",
                                            "item_name",
                                            "item_display_level",
                                            "item_selectable",
                                            "item_sort_sequence",
                                            "seasonal",
                                            "periodicity_code",
                                            "periodicity_name",
                                            "base_code",
                                            "base_name",
                                            "base_period")) %>% 
                                  group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                                  summarise(cpi_value = round(sum(value)/2,2))

#cu_emp_data_USFoodBeverage_MI %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Food and beverage stores
#EMP_DAT_CAT_SAN_USFoodBeverage_MI = EMP_DAT_CAT_SAN_NY %>%
#                                      filter(LineCode==705)%>%
#                                      mutate(parent_line_code = 705)

cpi_emp_data_USFoodBeverage_PA = merge(EMP_DAT_CAT_SAN_PA, cu_emp_data_USFoodBeverage_PA, 
                                       by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cpi_emp_data_USFoodBeverage_PA %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.1.18 TX - Texas

Dallas-Fort Worth
Houston-Galveston-Brazoria

cu_emp_data_USFoodBeverage_TX = cu_data_USFoodBeverage %>% 
                                  filter(area_name %in% c("Dallas-Fort Worth","Houston-Galveston-Brazoria"), 
                                         period %in% c("M01","M02","M03","M04","M05","M06","M07","M08","M09","M10","M11","M12")) %>% 
                                  mutate(parent_item_code = 'SAF', 
                                         emp_line_code = 705,
                                         area_code = 'A316;A318', 
                                         area_name = 'Dallas-Fort Worth;Houston-Galveston-Brazoria'
                                         ) %>% 
                                   select(c("parent_item_code",
                                            "emp_line_code",
                                            "series_id",
                                            "series_title",
                                            "year",
                                            "period",
                                            "period_abbr",
                                            "period_name",
                                            "value",
                                            "area_code",
                                            "area_name",
                                            "area_display_level",
                                            "area_selectable",
                                            "area_sort_sequence",
                                            "item_code",
                                            "item_name",
                                            "item_display_level",
                                            "item_selectable",
                                            "item_sort_sequence",
                                            "seasonal",
                                            "periodicity_code",
                                            "periodicity_name",
                                            "base_code",
                                            "base_name",
                                            "base_period")) %>% 
                                  group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                                  summarise(cpi_value = round(sum(value)/2,2))

#cu_emp_data_USFoodBeverage_MI %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Food and beverage stores
#EMP_DAT_CAT_SAN_USFoodBeverage_MI = EMP_DAT_CAT_SAN_NY %>%
#                                      filter(LineCode==705)%>%
#                                      mutate(parent_line_code = 705)

cpi_emp_data_USFoodBeverage_TX = merge(EMP_DAT_CAT_SAN_TX, cu_emp_data_USFoodBeverage_TX, 
                                       by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cpi_emp_data_USFoodBeverage_TX %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.1.19 WA - Washington

Seattle-Tacoma-Bremerton

cu_emp_data_USFoodBeverage_WA = cu_data_USFoodBeverage %>% 
                                  filter(area_name=="Seattle-Tacoma-Bremerton", 
                                         period %in% c("M01","M02","M03","M04","M05","M06","M07","M08","M09","M10","M11","M12")) %>% 
                                  mutate(parent_item_code = 'SAF', 
                                         emp_line_code = 705) %>% 
                                   select(c("parent_item_code",
                                            "emp_line_code",
                                            "series_id",
                                            "series_title",
                                            "year",
                                            "period",
                                            "period_abbr",
                                            "period_name",
                                            "value",
                                            "area_code",
                                            "area_name",
                                            "area_display_level",
                                            "area_selectable",
                                            "area_sort_sequence",
                                            "item_code",
                                            "item_name",
                                            "item_display_level",
                                            "item_selectable",
                                            "item_sort_sequence",
                                            "seasonal",
                                            "periodicity_code",
                                            "periodicity_name",
                                            "base_code",
                                            "base_name",
                                            "base_period")) %>% 
                                  group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                                  summarise(cpi_value = round(sum(value),2))

#cu_emp_data_USFoodBeverage_MI %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Food and beverage stores
#EMP_DAT_CAT_SAN_USFoodBeverage_MI = EMP_DAT_CAT_SAN_NY %>%
#                                      filter(LineCode==705)%>%
#                                      mutate(parent_line_code = 705)

cpi_emp_data_USFoodBeverage_WA = merge(EMP_DAT_CAT_SAN_WA, cu_emp_data_USFoodBeverage_WA, 
                                       by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cpi_emp_data_USFoodBeverage_WA %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.1.20 WI - Wisconsin

Milwaukee-Racine

cu_emp_data_USFoodBeverage_WI = cu_data_USFoodBeverage %>% 
                                  filter(area_name=="Milwaukee-Racine", 
                                         period %in% c("S01","S02")) %>% 
                                  mutate(parent_item_code = 'SAF', 
                                         emp_line_code = 705) %>% 
                                   select(c("parent_item_code",
                                            "emp_line_code",
                                            "series_id",
                                            "series_title",
                                            "year",
                                            "period",
                                            "period_abbr",
                                            "period_name",
                                            "value",
                                            "area_code",
                                            "area_name",
                                            "area_display_level",
                                            "area_selectable",
                                            "area_sort_sequence",
                                            "item_code",
                                            "item_name",
                                            "item_display_level",
                                            "item_selectable",
                                            "item_sort_sequence",
                                            "seasonal",
                                            "periodicity_code",
                                            "periodicity_name",
                                            "base_code",
                                            "base_name",
                                            "base_period")) %>% 
                                  group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                                  summarise(cpi_value = round(sum(value),2))

#cu_emp_data_USFoodBeverage_MI %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Food and beverage stores
#EMP_DAT_CAT_SAN_USFoodBeverage_MI = EMP_DAT_CAT_SAN_NY %>%
#                                      filter(LineCode==705)%>%
#                                      mutate(parent_line_code = 705)

cpi_emp_data_USFoodBeverage_WI = merge(EMP_DAT_CAT_SAN_WA, cu_emp_data_USFoodBeverage_WI, 
                                       by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cpi_emp_data_USFoodBeverage_WI %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.2 Housing

4.2.1 NY - New York

cu_emp_data_USHousing_NY = cu_data_USHousing  %>% 
                            filter(area_name=="New York-Northern New Jersey-Long Island", 
                                   period %in% c("M01","M02","M03","M04","M05","M06","M07","M08","M09","M10","M11","M12")) %>% 
                            mutate(parent_item_code = 'SAH', 
                                   emp_line_code = 1100) %>% 
                            select(c("parent_item_code",
                                     "emp_line_code",
                                     "series_id",
                                     "series_title",
                                     "year",
                                     "period",
                                     "period_abbr",
                                     "period_name",
                                     "value",
                                     "area_code",
                                     "area_name",
                                     "area_display_level",
                                     "area_selectable",
                                     "area_sort_sequence",
                                     "item_code",
                                     "item_name",
                                     "item_display_level",
                                     "item_selectable",
                                     "item_sort_sequence",
                                     "seasonal",
                                     "periodicity_code",
                                     "periodicity_name",
                                     "base_code",
                                     "base_name",
                                     "base_period")) %>% 
                            group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                            summarise(cpi_value = round(sum(value),2))
#cu_emp_data_USHousing_NY %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Real estate and rental and leasing
#EMP_DAT_CAT_SAN_USHousing_NY = EMP_DAT_CAT_SAN_NY %>%
#                                  filter(LineCode %in% c(1100,1101,1102,1103)) %>%
#                                  mutate(parent_line_code = 1100)

cpi_emp_data_USHousing_NY = merge(EMP_DAT_CAT_SAN_NY, cu_emp_data_USHousing_NY, 
                                  by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cpi_emp_data_USHousing_NY %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.2.2 MI - Michigan

cu_emp_data_USHousing_MI = cu_data_USHousing  %>% 
                            filter(area_name=="Detroit-Ann Arbor-Flint", 
                                   period %in% c("M01","M02","M03","M04","M05","M06","M07","M08","M09","M10","M11","M12")) %>% 
                            mutate(parent_item_code = 'SAH', 
                                   emp_line_code = 1100) %>% 
                            select(c("parent_item_code",
                                     "emp_line_code",
                                     "series_id",
                                     "series_title",
                                     "year",
                                     "period",
                                     "period_abbr",
                                     "period_name",
                                     "value",
                                     "area_code",
                                     "area_name",
                                     "area_display_level",
                                     "area_selectable",
                                     "area_sort_sequence",
                                     "item_code",
                                     "item_name",
                                     "item_display_level",
                                     "item_selectable",
                                     "item_sort_sequence",
                                     "seasonal",
                                     "periodicity_code",
                                     "periodicity_name",
                                     "base_code",
                                     "base_name",
                                     "base_period")) %>% 
                            group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                            summarise(cpi_value = round(sum(value),2))
#cu_emp_data_USHousing_MI %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Real estate and rental and leasing
#EMP_DAT_CAT_SAN_USHousing_MI = EMP_DAT_CAT_SAN_MI %>%
#                                  filter(LineCode %in% c(1100,1101,1102,1103)) %>%
#                                  mutate(parent_line_code = 1100)

cpi_emp_data_USHousing_MI = merge(EMP_DAT_CAT_SAN_MI, cu_emp_data_USHousing_MI, 
                                  by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cpi_emp_data_USHousing_MI %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.2.3 OR - Oregon

cu_emp_data_USHousing_OR = cu_data_USHousing  %>% 
                            filter(area_name=="Portland-Salem", 
                                   period %in% c("S01","S02")) %>% 
                            mutate(parent_item_code = 'SAH', 
                                   emp_line_code = 1100) %>% 
                            select(c("parent_item_code",
                                     "emp_line_code",
                                     "series_id",
                                     "series_title",
                                     "year",
                                     "period",
                                     "period_abbr",
                                     "period_name",
                                     "value",
                                     "area_code",
                                     "area_name",
                                     "area_display_level",
                                     "area_selectable",
                                     "area_sort_sequence",
                                     "item_code",
                                     "item_name",
                                     "item_display_level",
                                     "item_selectable",
                                     "item_sort_sequence",
                                     "seasonal",
                                     "periodicity_code",
                                     "periodicity_name",
                                     "base_code",
                                     "base_name",
                                     "base_period")) %>% 
                            group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                            summarise(cpi_value = round(sum(value),2))
#cu_emp_data_USHousing_MI %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Real estate and rental and leasing
#EMP_DAT_CAT_SAN_USHousing_MI = EMP_DAT_CAT_SAN_MI %>%
#                                  filter(LineCode %in% c(1100,1101,1102,1103)) %>%
#                                  mutate(parent_line_code = 1100)

cpi_emp_data_USHousing_OR = merge(EMP_DAT_CAT_SAN_OR, cu_emp_data_USHousing_OR, 
                                  by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cpi_emp_data_USHousing_OR %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.3 Transportation

4.3.1 NY - New York

cu_emp_data_USTransportation_NY = cu_data_USTransportation %>% 
                                    filter(area_name=="New York-Northern New Jersey-Long Island", 
                                           period %in% c("M01","M02","M03","M04","M05","M06","M07","M08","M09","M10","M11","M12")) %>% 
                                    mutate(parent_item_code = 'SAT', 
                                           emp_line_code = 800) %>% 
                                    select(c("parent_item_code",
                                             "emp_line_code",
                                             "series_id",
                                             "series_title",
                                             "year",
                                             "period",
                                             "period_abbr",
                                             "period_name",
                                             "value",
                                             "area_code",
                                             "area_name",
                                             "area_display_level",
                                             "area_selectable",
                                             "area_sort_sequence",
                                             "item_code",
                                             "item_name",
                                             "item_display_level",
                                             "item_selectable",
                                             "item_sort_sequence",
                                             "seasonal",
                                             "periodicity_code",
                                             "periodicity_name",
                                             "base_code",
                                             "base_name",
                                             "base_period")) %>% 
                                    group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                                    summarise(cpi_value = round(sum(value),2))
#cu_emp_data_USTransportation_NY %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Transportation and warehousing
#EMP_DAT_CAT_SAN_USTransportation_NY = EMP_DAT_CAT_SAN_NY %>%
#                                        filter(LineCode %in% c(800,801,802,803,804,805,806,807,808,809,811)) %>%
#                                        mutate(parent_line_code = 800)

cpi_emp_data_USTransportation_NY = merge(EMP_DAT_CAT_SAN_NY, cu_emp_data_USTransportation_NY, 
                                         by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cpi_emp_data_USTransportation_NY %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.3.2 MI - Michigan

cu_emp_data_USTransportation_MI = cu_data_USTransportation %>% 
                                    filter(area_name=="Detroit-Ann Arbor-Flint", 
                                           period %in% c("M01","M02","M03","M04","M05","M06","M07","M08","M09","M10","M11","M12")) %>% 
                                    mutate(parent_item_code = 'SAT', 
                                           emp_line_code = 800) %>% 
                                    select(c("parent_item_code",
                                             "emp_line_code",
                                             "series_id",
                                             "series_title",
                                             "year",
                                             "period",
                                             "period_abbr",
                                             "period_name",
                                             "value",
                                             "area_code",
                                             "area_name",
                                             "area_display_level",
                                             "area_selectable",
                                             "area_sort_sequence",
                                             "item_code",
                                             "item_name",
                                             "item_display_level",
                                             "item_selectable",
                                             "item_sort_sequence",
                                             "seasonal",
                                             "periodicity_code",
                                             "periodicity_name",
                                             "base_code",
                                             "base_name",
                                             "base_period")) %>% 
                                    group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                                    summarise(cpi_value = round(sum(value),2))
#cu_emp_data_USTransportation_NY %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Transportation and warehousing
#EMP_DAT_CAT_SAN_USTransportation_NY = EMP_DAT_CAT_SAN_NY %>%
#                                        filter(LineCode %in% c(800,801,802,803,804,805,806,807,808,809,811)) %>%
#                                        mutate(parent_line_code = 800)

cpi_emp_data_USTransportation_MI = merge(EMP_DAT_CAT_SAN_MI, cu_emp_data_USTransportation_MI, 
                                         by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cpi_emp_data_USTransportation_MI %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.3.3 OR - Oregon

cu_emp_data_USTransportation_OR = cu_data_USTransportation %>% 
                                    filter(area_name=="Portland-Salem", 
                                           period %in% c("S01","S02")) %>% 
                                    mutate(parent_item_code = 'SAT', 
                                           emp_line_code = 800) %>% 
                                    select(c("parent_item_code",
                                             "emp_line_code",
                                             "series_id",
                                             "series_title",
                                             "year",
                                             "period",
                                             "period_abbr",
                                             "period_name",
                                             "value",
                                             "area_code",
                                             "area_name",
                                             "area_display_level",
                                             "area_selectable",
                                             "area_sort_sequence",
                                             "item_code",
                                             "item_name",
                                             "item_display_level",
                                             "item_selectable",
                                             "item_sort_sequence",
                                             "seasonal",
                                             "periodicity_code",
                                             "periodicity_name",
                                             "base_code",
                                             "base_name",
                                             "base_period")) %>% 
                                    group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                                    summarise(cpi_value = round(sum(value),2))
#cu_emp_data_USTransportation_NY %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Transportation and warehousing
#EMP_DAT_CAT_SAN_USTransportation_NY = EMP_DAT_CAT_SAN_NY %>%
#                                        filter(LineCode %in% c(800,801,802,803,804,805,806,807,808,809,811)) %>%
#                                        mutate(parent_line_code = 800)

cpi_emp_data_USTransportation_OR = merge(EMP_DAT_CAT_SAN_OR, cu_emp_data_USTransportation_OR, 
                                         by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cpi_emp_data_USTransportation_OR %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.4 Medical

4.4.1 NY - New York

cu_emp_data_USMedical_NY = cu_data_USMedical %>% 
                            filter(area_name=="New York-Northern New Jersey-Long Island", 
                                   period %in% c("M01","M02","M03","M04","M05","M06","M07","M08","M09","M10","M11","M12")) %>% 
                            mutate(parent_item_code = 'SAM', 
                                   emp_line_code = 1600) %>% 
                            select(c("parent_item_code",
                                     "emp_line_code",
                                     "series_id",
                                     "series_title",
                                     "year",
                                     "period",
                                     "period_abbr",
                                     "period_name",
                                     "value",
                                     "area_code",
                                     "area_name",
                                     "area_display_level",
                                     "area_selectable",
                                     "area_sort_sequence",
                                     "item_code",
                                     "item_name",
                                     "item_display_level",
                                     "item_selectable",
                                     "item_sort_sequence",
                                     "seasonal",
                                     "periodicity_code",
                                     "periodicity_name",
                                     "base_code",
                                     "base_name",
                                     "base_period")) %>% 
                            group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                            summarise(cpi_value = round(sum(value),2))
#cu_emp_data_USMedical_NY %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Health care and social assistance
#EMP_DAT_CAT_SAN_USTransportation_NY = EMP_DAT_CAT_SAN_NY %>%
#                                        filter(LineCode %in% c(1600,1601,1602,1603,1604)) %>%
#                                        mutate(parent_line_code = 1600)

cpi_emp_data_USMedical_NY = merge(EMP_DAT_CAT_SAN_NY, cu_emp_data_USMedical_NY, 
                                  by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cpi_emp_data_USMedical_NY %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.4.2 MI - Michigan

cu_emp_data_USMedical_MI = cu_data_USMedical %>% 
                            filter(area_name=="Detroit-Ann Arbor-Flint", 
                                   period %in% c("M01","M02","M03","M04","M05","M06","M07","M08","M09","M10","M11","M12")) %>% 
                            mutate(parent_item_code = 'SAM', 
                                   emp_line_code = 1600) %>% 
                            select(c("parent_item_code",
                                     "emp_line_code",
                                     "series_id",
                                     "series_title",
                                     "year",
                                     "period",
                                     "period_abbr",
                                     "period_name",
                                     "value",
                                     "area_code",
                                     "area_name",
                                     "area_display_level",
                                     "area_selectable",
                                     "area_sort_sequence",
                                     "item_code",
                                     "item_name",
                                     "item_display_level",
                                     "item_selectable",
                                     "item_sort_sequence",
                                     "seasonal",
                                     "periodicity_code",
                                     "periodicity_name",
                                     "base_code",
                                     "base_name",
                                     "base_period")) %>% 
                            group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                            summarise(cpi_value = round(sum(value),2))
#cu_emp_data_USMedical_NY %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Health care and social assistance
#EMP_DAT_CAT_SAN_USTransportation_NY = EMP_DAT_CAT_SAN_NY %>%
#                                        filter(LineCode %in% c(1600,1601,1602,1603,1604)) %>%
#                                        mutate(parent_line_code = 1600)

cpi_emp_data_USMedical_MI = merge(EMP_DAT_CAT_SAN_MI, cu_emp_data_USMedical_MI, 
                                  by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cpi_emp_data_USMedical_MI %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.4.3 OR - Oregon

cu_emp_data_USMedical_OR = cu_data_USMedical %>% 
                            filter(area_name=="Portland-Salem", 
                                   period %in% c("S01","S02")) %>% 
                            mutate(parent_item_code = 'SAM', 
                                   emp_line_code = 1600) %>% 
                            select(c("parent_item_code",
                                     "emp_line_code",
                                     "series_id",
                                     "series_title",
                                     "year",
                                     "period",
                                     "period_abbr",
                                     "period_name",
                                     "value",
                                     "area_code",
                                     "area_name",
                                     "area_display_level",
                                     "area_selectable",
                                     "area_sort_sequence",
                                     "item_code",
                                     "item_name",
                                     "item_display_level",
                                     "item_selectable",
                                     "item_sort_sequence",
                                     "seasonal",
                                     "periodicity_code",
                                     "periodicity_name",
                                     "base_code",
                                     "base_name",
                                     "base_period")) %>% 
                            group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                            summarise(cpi_value = round(sum(value),2))
#cu_emp_data_USMedical_NY %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Health care and social assistance
#EMP_DAT_CAT_SAN_USTransportation_NY = EMP_DAT_CAT_SAN_NY %>%
#                                        filter(LineCode %in% c(1600,1601,1602,1603,1604)) %>%
#                                        mutate(parent_line_code = 1600)

cpi_emp_data_USMedical_OR = merge(EMP_DAT_CAT_SAN_OR, cu_emp_data_USMedical_OR, 
                                  by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cpi_emp_data_USMedical_OR %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.5 Education & Communication

4.5.1 NY - New York

cu_data_USEducationAndCommunication_NY = cu_data_USEducationAndCommunication %>% 
                                          filter(area_name=="New York-Northern New Jersey-Long Island", 
                                                 period %in% c("M01","M02","M03","M04","M05","M06","M07","M08","M09","M10","M11","M12")) %>% 
                                          mutate(parent_item_code = 'SAE', emp_line_code = 1500) %>% 
                                          select(c("parent_item_code",
                                                   "emp_line_code",
                                                   "series_id",
                                                   "series_title",
                                                   "year",
                                                   "period",
                                                   "period_abbr",
                                                   "period_name",
                                                   "value",
                                                   "area_code",
                                                   "area_name",
                                                   "area_display_level",
                                                   "area_selectable",
                                                   "area_sort_sequence",
                                                   "item_code",
                                                   "item_name",
                                                   "item_display_level",
                                                   "item_selectable",
                                                   "item_sort_sequence",
                                                   "seasonal",
                                                   "periodicity_code",
                                                   "periodicity_name",
                                                   "base_code",
                                                   "base_name",
                                                   "base_period")) %>% 
                                          group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                                          summarise(cpi_value = round(sum(value),2))
#cu_data_USEducationAndCommunication_NY %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Educational services
#EMP_DAT_CAT_SAN_USEducationAndCommunication_NY = EMP_DAT_CAT_SAN_NY %>%
#                                                   filter(LineCode %in% c(1500)) %>%
#                                                   mutate(parent_line_code = 1500)

cpi_emp_data_USEducationAndCommunication_NY = merge(EMP_DAT_CAT_SAN_NY, cu_data_USEducationAndCommunication_NY, 
                                                    by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cpi_emp_data_USEducationAndCommunication_NY %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.5.2 MI - Michigan

cu_data_USEducationAndCommunication_MI = cu_data_USEducationAndCommunication %>% 
                                          filter(area_name=="Detroit-Ann Arbor-Flint", 
                                                 period %in% c("M01","M02","M03","M04","M05","M06","M07","M08","M09","M10","M11","M12")) %>% 
                                          mutate(parent_item_code = 'SAE', emp_line_code = 1500) %>% 
                                          select(c("parent_item_code",
                                                   "emp_line_code",
                                                   "series_id",
                                                   "series_title",
                                                   "year",
                                                   "period",
                                                   "period_abbr",
                                                   "period_name",
                                                   "value",
                                                   "area_code",
                                                   "area_name",
                                                   "area_display_level",
                                                   "area_selectable",
                                                   "area_sort_sequence",
                                                   "item_code",
                                                   "item_name",
                                                   "item_display_level",
                                                   "item_selectable",
                                                   "item_sort_sequence",
                                                   "seasonal",
                                                   "periodicity_code",
                                                   "periodicity_name",
                                                   "base_code",
                                                   "base_name",
                                                   "base_period")) %>% 
                                          group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                                          summarise(cpi_value = round(sum(value),2))
#cu_data_USEducationAndCommunication_NY %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Educational services
#EMP_DAT_CAT_SAN_USEducationAndCommunication_NY = EMP_DAT_CAT_SAN_NY %>%
#                                                   filter(LineCode %in% c(1500)) %>%
#                                                   mutate(parent_line_code = 1500)
                                      
                                                    

cpi_emp_data_USEducationAndCommunication_MI = merge(EMP_DAT_CAT_SAN_MI, cu_data_USEducationAndCommunication_MI, 
                                                    by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cpi_emp_data_USEducationAndCommunication_MI %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

4.5.3 OR - Oregon

cu_data_USEducationAndCommunication_OR = cu_data_USEducationAndCommunication %>% 
                                          filter(area_name=="Portland-Salem", 
                                                 period %in% c("S01","S02")) %>% 
                                          mutate(parent_item_code = 'SAE', emp_line_code = 1500) %>% 
                                          select(c("parent_item_code",
                                                   "emp_line_code",
                                                   "series_id",
                                                   "series_title",
                                                   "year",
                                                   "period",
                                                   "period_abbr",
                                                   "period_name",
                                                   "value",
                                                   "area_code",
                                                   "area_name",
                                                   "area_display_level",
                                                   "area_selectable",
                                                   "area_sort_sequence",
                                                   "item_code",
                                                   "item_name",
                                                   "item_display_level",
                                                   "item_selectable",
                                                   "item_sort_sequence",
                                                   "seasonal",
                                                   "periodicity_code",
                                                   "periodicity_name",
                                                   "base_code",
                                                   "base_name",
                                                   "base_period")) %>% 
                                          group_by(parent_item_code, emp_line_code, year, area_code, area_name) %>% 
                                          summarise(cpi_value = round(sum(value),2))
#cu_data_USEducationAndCommunication_NY %>%
#  sample_frac(0.33) %>% 
#  DT::datatable()

#Educational services
#EMP_DAT_CAT_SAN_USEducationAndCommunication_NY = EMP_DAT_CAT_SAN_NY %>%
#                                                   filter(LineCode %in% c(1500)) %>%
#                                                   mutate(parent_line_code = 1500)

cpi_emp_data_USEducationAndCommunication_OR = merge(EMP_DAT_CAT_SAN_OR, cu_data_USEducationAndCommunication_OR, 
                                                    by.x=c("LineCode","Year"), by.y=c("emp_line_code","year"))

cpi_emp_data_USEducationAndCommunication_OR %>%
#  sample_frac(0.33) %>% 
  DT::datatable()

5 RAJWANT WORK

cpi_emp_data_USFoodBeverage_NY  %>% select(c("Year","Sector"= "Description","No_Jobs"="Number of jobs","cpi_value")) %>%   gather(key="Type", value ="Value", "cpi_value","No_Jobs") %>% ggplot(mapping = aes(x= Year, y=(as.numeric(Value)),fill = Type,group = Type)) + 
geom_col(position = position_dodge()) + geom_smooth(method = "lm")

## `geom_smooth()` using formula 'y ~ x'

   theme(axis.text.x = element_text(angle = 60, colour="gray",hjust = 1,size=rel(0.86)))+
  labs(title = "USSFoodBeverage Data for No. Of Jobs and CPI Value in NY",
       y = "Sector",x= "Years")

## List of 4
##  $ axis.text.x:List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : chr "gray"
##   ..$ size         : 'rel' num 0.86
##   ..$ hjust        : num 1
##   ..$ vjust        : NULL
##   ..$ angle        : num 60
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ y          : chr "Sector"
##  $ x          : chr "Years"
##  $ title      : chr "USSFoodBeverage Data for No. Of Jobs and CPI Value in NY"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE

cpi_emp_data_USFoodBeverage_NY  %>% select(c("Year","Sector"= "Description","No_Jobs"="Number of jobs")) %>% ggplot(mapping = aes(x= Year, y=(as.numeric(No_Jobs)))) + geom_col()+    theme(axis.text.x = element_text(angle = 60, colour="gray",hjust = 1,size=rel(0.86)))+
  labs(title = "USSFoodBeverage Data for No. Of Jobs  in NY",
       y = "Sector",x= "Years")

cpi_emp_data_USFoodBeverage_NY  %>% select(c("Year","Sector"= "Description","cpi_value")) %>% ggplot(mapping = aes(x= Year, y=(as.numeric(cpi_value)))) + geom_col()+    theme(axis.text.x = element_text(angle = 60, colour="gray",hjust = 1,size=rel(0.86)))+
  labs(title = "USSFoodBeverage Data for CPI Value in NY",
       y = "Sector",x= "Years")

# summary(as.numeric(cpi_emp_data_USFoodBeverage_NY$`Number of jobs`))
# summary(as.numeric(cpi_emp_data_USFoodBeverage_NY$cpi_value))
# 
# quantile(as.numeric(cpi_emp_data_USFoodBeverage_NY$`Number of jobs`))
# levels(quantile(as.numeric(cpi_emp_data_USFoodBeverage_NY$cpi_value)))

5.1 Model

#library(DataExplorer)
#library(plot_histogram)
library(lazytrade)

ALL_NY <- rbind(cpi_emp_data_USTransportation_NY,
      cpi_emp_data_USMedical_NY,
      cpi_emp_data_USHousing_NY,
      cpi_emp_data_USFoodBeverage_NY,
      cpi_emp_data_USEducationAndCommunication_NY)


#DataExplorer::plot_missing(ALL_NY, title = "Beverage Training Data: % Missing Values by Data Element")

ALL_NY$`Number of jobs` = as.numeric(ALL_NY$`Number of jobs`)
#plot_histogram(ALL_NY, ggtheme=theme_light())
#plot_correlation(ALL_NY[,c(2,5,9)])

ggplot(ALL_NY[,c(2,3,5,9)], mapping= aes(y= log(cpi_value), x=Year,color= Description, group = Description)) + geom_smooth() +
  theme_classic()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(ALL_NY[,c(2,3,5,9)], mapping= aes(y= log(`Number of jobs`), x=Year,color= Description, group = Description)) + geom_smooth() + theme_classic()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ALL_NY

ALL_NY_M <- ALL_NY[,c(2,5,6,9)]
names(ALL_NY_M) = c("Year", "Jobs", "C_Item","CPI")

# ALL_NY_M$Year = as.character(ALL_NY_M$Year)
# ALL_NY_M$E_Area = as.character(ALL_NY_M$E_Area)
library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

set.seed(3456)
trainIndex <- createDataPartition(ALL_NY_M$Jobs, p = .8, 
                                  list = FALSE, 
                                  times = 1)
ALL_NY_train <- ALL_NY_M[ trainIndex,]
ALL_NY_test  <- ALL_NY_M[-trainIndex,]

head(ALL_NY_M)

metric = 'RMSE'
head(ALL_NY_train )

# Train control

customTrainControl <- trainControl(method = "repeatedcv", 
                                   number = 10 , 
                                   repeats = 5 ,
                                   verboseIter = F)
# Linear Model

lm <- train(Jobs ~ .,
            ALL_NY_train,
            method= 'lm',
            trControl = customTrainControl)

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

lm$results

summary(lm)

## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -141982  -33811    -302   30145  148496 
## 
## Coefficients: (1 not defined because of singularities)
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.918e+05  3.142e+04   9.287 7.38e-13 ***
## Year1999    -3.829e+03  4.189e+04  -0.091 0.927494    
## Year2000     2.578e+04  4.174e+04   0.618 0.539326    
## Year2001     3.835e+04  3.933e+04   0.975 0.333891    
## Year2002     4.543e+04  3.933e+04   1.155 0.253104    
## Year2003     5.607e+04  4.239e+04   1.322 0.191478    
## Year2004     7.446e+04  3.987e+04   1.868 0.067154 .  
## Year2005     8.853e+04  4.303e+04   2.057 0.044417 *  
## Year2006     1.064e+05  4.349e+04   2.446 0.017670 *  
## Year2007     1.190e+05  4.792e+04   2.483 0.016089 *  
## Year2008     1.397e+05  4.615e+04   3.028 0.003742 ** 
## Year2009     1.395e+05  4.692e+04   2.973 0.004370 ** 
## Year2010     1.621e+05  4.642e+04   3.493 0.000952 ***
## Year2011     1.709e+05  4.532e+04   3.771 0.000399 ***
## Year2012     1.633e+05  4.924e+04   3.316 0.001620 ** 
## Year2013     1.869e+05  4.719e+04   3.961 0.000216 ***
## Year2014     1.982e+05  5.909e+04   3.354 0.001448 ** 
## Year2015     2.303e+05  4.427e+04   5.202 3.00e-06 ***
## Year2016     2.593e+05  5.744e+04   4.514 3.40e-05 ***
## Year2017     2.408e+05  4.215e+04   5.714 4.65e-07 ***
## Year2018            NA         NA      NA       NA    
## C_ItemSAF   -1.453e+05  3.882e+04  -3.742 0.000437 ***
## C_ItemSAH    1.903e+05  8.301e+04   2.293 0.025699 *  
## C_ItemSAM    1.043e+06  2.330e+04  44.749  < 2e-16 ***
## C_ItemSAT   -2.298e+04  4.981e+04  -0.461 0.646392    
## CPI         -3.298e+00  3.231e+00  -1.021 0.311804    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 61960 on 55 degrees of freedom
## Multiple R-squared:  0.9866, Adjusted R-squared:  0.9807 
## F-statistic: 168.5 on 24 and 55 DF,  p-value: < 2.2e-16

par(mfrow=c(2,2))
plot(lm$finalModel)

#--------------
# https://stackoverflow.com/questions/44200195/how-to-debug-contrasts-can-be-applied-only-to-factors-with-2-or-more-levels-er
ALL_NY_train %>% dplyr::mutate_all(as.factor) %>% str

## 'data.frame':    80 obs. of  4 variables:
##  $ Year  : Factor w/ 21 levels "1998","1999",..: 1 3 4 5 7 11 12 13 14 15 ...
##  $ Jobs  : Factor w/ 79 levels "202970","205107",..: 19 23 25 18 20 28 24 21 26 29 ...
##  $ C_Item: Factor w/ 5 levels "SAE","SAF","SAH",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ CPI   : Factor w/ 80 levels "1196.3","1207.7",..: 34 42 40 36 48 60 56 59 66 68 ...

unique(ALL_NY$area_code)

## [1] A101
## 45 Levels: 0000 0100 0200 0300 0400 A000 A100 A101 A102 A103 A104 A200 ... X400

# multiple LM
lm_cpi_item <- lm(Jobs ~ CPI+C_Item,ALL_NY_train)
summary(lm_cpi_item)

## 
## Call:
## lm(formula = Jobs ~ CPI + C_Item, data = ALL_NY_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -238919  -41944   -5136   25157  299676 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.819e+05  2.274e+04  16.791  < 2e-16 ***
## CPI          7.314e+00  2.727e+00   2.682  0.00903 ** 
## C_ItemSAF   -2.541e+05  4.090e+04  -6.214 2.76e-08 ***
## C_ItemSAH   -7.161e+04  7.466e+04  -0.959  0.34063    
## C_ItemSAM    1.017e+06  3.135e+04  32.443  < 2e-16 ***
## C_ItemSAT   -1.626e+05  5.094e+04  -3.192  0.00207 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 87680 on 74 degrees of freedom
## Multiple R-squared:  0.9638, Adjusted R-squared:  0.9614 
## F-statistic: 394.5 on 5 and 74 DF,  p-value: < 2.2e-16

lm_cpi2_item <- lm(Jobs ~ (CPI^2)+C_Item,ALL_NY_train)
summary(lm_cpi2_item)

## 
## Call:
## lm(formula = Jobs ~ (CPI^2) + C_Item, data = ALL_NY_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -238919  -41944   -5136   25157  299676 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.819e+05  2.274e+04  16.791  < 2e-16 ***
## CPI          7.314e+00  2.727e+00   2.682  0.00903 ** 
## C_ItemSAF   -2.541e+05  4.090e+04  -6.214 2.76e-08 ***
## C_ItemSAH   -7.161e+04  7.466e+04  -0.959  0.34063    
## C_ItemSAM    1.017e+06  3.135e+04  32.443  < 2e-16 ***
## C_ItemSAT   -1.626e+05  5.094e+04  -3.192  0.00207 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 87680 on 74 degrees of freedom
## Multiple R-squared:  0.9638, Adjusted R-squared:  0.9614 
## F-statistic: 394.5 on 5 and 74 DF,  p-value: < 2.2e-16

lm_cpi_year <- lm(Jobs ~ CPI+Year,ALL_NY_train)
summary(lm_cpi_year)

## 
## Call:
## lm(formula = Jobs ~ CPI + Year, data = ALL_NY_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -543365 -328882 -178870  239324  874565 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 608217.21  219290.01   2.774  0.00741 **
## CPI            -15.04       5.88  -2.558  0.01310 * 
## Year1999     57003.61  319638.10   0.178  0.85907   
## Year2000    113213.91  320094.66   0.354  0.72483   
## Year2001     50576.44  301418.64   0.168  0.86732   
## Year2002     57639.07  301418.44   0.191  0.84901   
## Year2003    131628.51  319744.44   0.412  0.68207   
## Year2004    101161.63  301652.70   0.335  0.73854   
## Year2005    175316.88  319939.52   0.548  0.58578   
## Year2006    199568.55  320094.78   0.623  0.53538   
## Year2007     -8440.53  349762.90  -0.024  0.98083   
## Year2008    294192.97  323627.32   0.909  0.36702   
## Year2009    249079.79  348085.16   0.716  0.47708   
## Year2010    326193.37  321806.45   1.014  0.31490   
## Year2011    253681.42  304191.90   0.834  0.40767   
## Year2012     74205.25  325084.20   0.228  0.82023   
## Year2013    229539.76  320171.91   0.717  0.47625   
## Year2014    191238.92  409103.60   0.467  0.64189   
## Year2015    305188.84  303679.30   1.005  0.31901   
## Year2016    320336.72  409891.32   0.782  0.43762   
## Year2017    269002.74  320008.39   0.841  0.40396   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 476500 on 59 degrees of freedom
## Multiple R-squared:  0.1487, Adjusted R-squared:  -0.1399 
## F-statistic: 0.5153 on 20 and 59 DF,  p-value: 0.949

xyplot(ALL_NY_test$Jobs ~ predict(lm),
 ## plot the points (type = 'p') and a background grid ('g')
 type = c("p", "g"),
 xlab = "Predicted", ylab = "Observed")

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

xyplot(ALL_NY_test$Jobs ~ predict(lm_cpi_year),
 ## plot the points (type = 'p') and a background grid ('g')
 type = c("p", "g"),
 xlab = "Predicted", ylab = "Observed")

 xyplot(resid(lm_cpi_year) ~ predict(lm_cpi_year),
 type = c("p", "g"),
 xlab = "Predicted", ylab = "Residuals")

  #PLS
 # Useing train perform  to perfrom  pre-process and tuning together. The function first preprocess the training set by centering it and scaling it. Then the function uses 10-fold cross validation to try the number of components, i.e. latent variables, of the PLS model from 1 to 20.
 # # The train function generates a resampling estimate of performance. Because
# the training set size is not small, 10-fold cross-validation should produce
# reasonable estimates of model performance. The function trainControl specifies
# the type of resampling:
ctrl <- trainControl(method = "cv", number = 10)

model_pls<- train(Jobs ~ .,   ALL_NY_train,
                     method = "pls",
                     tuneLength = 20,
                     metric='Rsquared',
                     trControl = ctrl,
                     preProc = c("center", "scale"))

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Year2018

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Year2018

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Year2018

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Year2018

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Year2018

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Year2018

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Year2018

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Year2018

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Year2006, Year2018

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Year2018

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Year2018

model_pls

## Partial Least Squares 
## 
## 80 samples
##  3 predictor
## 
## Pre-processing: centered (25), scaled (25) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 72, 72, 72, 72, 72, 72, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE       Rsquared   MAE      
##    1     262509.32  0.6477474  217562.44
##    2     167818.17  0.8622355  140030.97
##    3      96337.03  0.9561960   81301.18
##    4      78353.72  0.9748132   63473.45
##    5      79356.89  0.9718313   64146.98
##    6      80180.07  0.9686154   66026.58
##    7      82402.21  0.9661310   68664.52
##    8      82283.96  0.9663358   68616.73
##    9      83158.41  0.9651061   69421.75
##   10      84395.59  0.9630032   71213.14
##   11      84342.06  0.9619838   71189.22
##   12      84362.30  0.9619403   71196.97
##   13      84362.44  0.9619400   71197.03
##   14      84362.44  0.9619400   71197.03
##   15      84362.44  0.9619400   71197.03
##   16      84362.44  0.9619400   71197.03
##   17      84362.44  0.9619400   71197.03
##   18      84362.44  0.9619400   71197.03
##   19      84362.44  0.9619400   71197.03
##   20      84362.44  0.9619400   71197.03
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 4.

model_pls$results[3,2:4]

# M

model_glm<- train(Jobs ~ .,   ALL_NY_train,
                     method = "glm",
                     tuneLength = 20,
                     metric='Rsquared',
                     trControl = ctrl,
                     preProc = c("center", "scale"))

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Year2018

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Year2018

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Year2018

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Year2018

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Year2018

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Year2018

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Year2018

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Year2018

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Year2018

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Year2018

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Year2018

model_glm

## Generalized Linear Model 
## 
## 80 samples
##  3 predictor
## 
## Pre-processing: centered (25), scaled (25) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 72, 72, 72, 72, 72, 72, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   75224.05  0.9751087  60425.63

test_model(test_dataset = model_pls,
           predictor_dataset = ALL_NY_train,
           test_type = "regression")


# TEST MODEL with Train  data
xyplot(ALL_NY_train$Jobs ~ predict(model_pls),
 ## plot the points (type = 'p') and a background grid ('g')
 type = c("p", "g"),
 xlab = "Predicted", ylab = "Observed")


xyplot(ALL_NY_train$Jobs ~ predict(lm),
 ## plot the points (type = 'p') and a background grid ('g')
 type = c("p", "g"),
 xlab = "Predicted", ylab = "Observed")

xyplot(ALL_NY_train$Jobs ~ predict(lm_cpi2_item),
 ## plot the points (type = 'p') and a background grid ('g')
 type = c("p", "g"),
 xlab = "Predicted", ylab = "Observed")


xyplot(ALL_NY_train$Jobs ~ predict(lm_cpi_item),
 ## plot the points (type = 'p') and a background grid ('g')
 type = c("p", "g"),
 xlab = "Predicted", ylab = "Observed")

xyplot(ALL_NY_train$Jobs ~ predict(model_glm),
 ## plot the points (type = 'p') and a background grid ('g')
 type = c("p", "g"),
 xlab = "Predicted", ylab = "Observed")

# TEST MODEL with TEst data
xyplot(ALL_NY_test$Jobs ~ predict(model_pls,ALL_NY_test),
 ## plot the points (type = 'p') and a background grid ('g')
 type = c("p", "g"),
 xlab = "Predicted", ylab = "Observed")

xyplot(ALL_NY_test$Jobs ~ predict(lm,ALL_NY_test),
 ## plot the points (type = 'p') and a background grid ('g')
 type = c("p", "g"),
 xlab = "Predicted", ylab = "Observed")

xyplot(ALL_NY_test$Jobs ~ predict(lm_cpi_item,ALL_NY_test),
 ## plot the points (type = 'p') and a background grid ('g')
 type = c("p", "g"),
 xlab = "Predicted", ylab = "Observed")


# REssidual Plot with train data
 xyplot(resid(model_pls) ~ predict(model_pls),
 type = c("p", "g"),
 xlab = "Predicted", ylab = "Residuals")
 
  xyplot(resid(lm) ~ predict(lm),
 type = c("p", "g"),
 xlab = "Predicted", ylab = "Residuals")

     xyplot(resid(lm_cpi_item) ~ predict(lm_cpi_item),
 type = c("p", "g"),
 xlab = "Predicted", ylab = "Residuals")
    
   xyplot(resid(lm_cpi2_item) ~ predict(lm_cpi2_item),
 type = c("p", "g"),
 xlab = "Predicted", ylab = "Residuals")

   
      xyplot(resid(lm_cpi2_item) ~ predict(lm_cpi2_item),
 type = c("p", "g"),
 xlab = "Predicted", ylab = "Residuals")
      
 
 # REssidual Plot with Test data
 xyplot(resid(model_pls) ~ predict(model_pls,ALL_NY_test),
 type = c("p", "g"),
 xlab = "Predicted", ylab = "Residuals")
 
  xyplot(resid(model_glm) ~ predict(model_glm,ALL_NY_test),
 type = c("p", "g"),
 xlab = "Predicted", ylab = "Residuals")
  
    xyplot(resid(lm_cpi_item) ~ predict(lm_cpi_item,ALL_NY_test),
 type = c("p", "g"),
 xlab = "Predicted", ylab = "Residuals")
    
    

    
data.frame("LM Model"= defaultSummary(data.frame(obs=ALL_NY_test$Jobs,pred=predict(lm, ALL_NY_test))),"MAPE" =  MLmetrics::MAPE(predict(lm, ALL_NY_test),ALL_NY_test$Jobs))

data.frame("lm_cpi2_item Model"= defaultSummary(data.frame(obs=ALL_NY_test$Jobs,pred=predict(lm_cpi2_item, ALL_NY_test))),"MAPE" =  MLmetrics::MAPE(predict(lm_cpi2_item, ALL_NY_test),ALL_NY_test$Jobs))

data.frame("lm_cpi_item Model"= defaultSummary(data.frame(obs=ALL_NY_test$Jobs,pred=predict(lm_cpi_item, ALL_NY_test))),"MAPE" =  MLmetrics::MAPE(predict(lm_cpi2_item, ALL_NY_test),ALL_NY_test$Jobs))

data.frame("lm_cpi_year Model"= defaultSummary(data.frame(obs=ALL_NY_test$Jobs,pred=predict(lm_cpi_year, ALL_NY_test))),"MAPE" =  MLmetrics::MAPE(predict(lm_cpi_year, ALL_NY_test),ALL_NY_test$Jobs))

data.frame("PLS Model"= defaultSummary(data.frame(obs=ALL_NY_test$Jobs,pred=predict(model_pls, ALL_NY_test))),"MAPE" =  MLmetrics::MAPE(predict(model_pls, ALL_NY_test),ALL_NY_test$Jobs))


cbind(
  data.frame("LM Model"= defaultSummary(data.frame(obs=ALL_NY_test$Jobs,pred=predict(lm, ALL_NY_test))),"MAPE" =  MLmetrics::MAPE(predict(lm, ALL_NY_test),ALL_NY_test$Jobs)),

data.frame("lm_cpi2_item Model"= defaultSummary(data.frame(obs=ALL_NY_test$Jobs,pred=predict(lm_cpi2_item, ALL_NY_test))),"MAPE" =  MLmetrics::MAPE(predict(lm_cpi2_item, ALL_NY_test),ALL_NY_test$Jobs)),

data.frame("lm_cpi_item Model"= defaultSummary(data.frame(obs=ALL_NY_test$Jobs,pred=predict(lm_cpi_item, ALL_NY_test))),"MAPE" =  MLmetrics::MAPE(predict(lm_cpi2_item, ALL_NY_test),ALL_NY_test$Jobs)),

data.frame("lm_cpi_year Model"= defaultSummary(data.frame(obs=ALL_NY_test$Jobs,pred=predict(lm_cpi_year, ALL_NY_test))),"MAPE" =  MLmetrics::MAPE(predict(lm_cpi_year, ALL_NY_test),ALL_NY_test$Jobs)),

data.frame("PLS Model"= defaultSummary(data.frame(obs=ALL_NY_test$Jobs,pred=predict(model_pls, ALL_NY_test))),"MAPE" =  MLmetrics::MAPE(predict(model_pls, ALL_NY_test),ALL_NY_test$Jobs))
)

6 References

How to import multiple .csv files at once?