project2_CPI

Loading the CPI report

Below I import the csv file containing the CPI report from Github, where I saved the file.

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

fileURL <- 'https://raw.githubusercontent.com/stoybis/DATA607Repo/main/news-release-table1-202401.csv'

cpiData <- read_csv(fileURL, skip = 3, col_names = TRUE, show_col_types = FALSE)

## New names:
## • `Unadjusted indexes` -> `Unadjusted indexes...4`
## • `Unadjusted indexes` -> `Unadjusted indexes...5`
## • `Unadjusted indexes` -> `Unadjusted indexes...6`
## • `Unadjusted percent change` -> `Unadjusted percent change...7`
## • `Unadjusted percent change` -> `Unadjusted percent change...8`
## • `Seasonally adjusted percent change` -> `Seasonally adjusted percent
##   change...9`
## • `Seasonally adjusted percent change` -> `Seasonally adjusted percent
##   change...10`
## • `Seasonally adjusted percent change` -> `Seasonally adjusted percent
##   change...11`

head(cpiData)

## # A tibble: 6 × 11
##   `Indent Level` `Expenditure category`      `Relative\nimportance\nDec.\n2023`
##            <dbl> <chr>                                                    <dbl>
## 1             NA <NA>                                                     NA   
## 2             NA <NA>                                                     NA   
## 3              0 All items                                               100   
## 4              1 Food                                                     13.6 
## 5              2 Food at home                                              8.17
## 6              3 Cereals and bakery products                               1.07
## # ℹ 8 more variables: `Unadjusted indexes...4` <chr>,
## #   `Unadjusted indexes...5` <chr>, `Unadjusted indexes...6` <chr>,
## #   `Unadjusted percent change...7` <chr>,
## #   `Unadjusted percent change...8` <chr>,
## #   `Seasonally adjusted percent change...9` <chr>,
## #   `Seasonally adjusted percent change...10` <chr>,
## #   `Seasonally adjusted percent change...11` <chr>

Cleaning and tidying the data

The data can be tidied such that each line is a new observation. For example, for the item food, we can observe different measures in each row (importance, un-adjusted index values, etc).

Below I clean the data. I start by removing the last four rows, which are footnotes.

cpiData <- cpiData[-c(42,43,44,45,46),]

tail(cpiData)

## # A tibble: 6 × 11
##   `Indent Level` `Expenditure category`                  Relative\nimportance\…¹
##            <dbl> <chr>                                                     <dbl>
## 1              4 Physicians' services(1)                                   1.83 
## 2              4 Hospital services(1)(3)                                   1.99 
## 3              3 Transportation services                                   6.29 
## 4              4 Motor vehicle maintenance and repair(1)                   1.23 
## 5              4 Motor vehicle insurance                                   2.79 
## 6              4 Airline fares                                             0.751
## # ℹ abbreviated name: ¹`Relative\nimportance\nDec.\n2023`
## # ℹ 8 more variables: `Unadjusted indexes...4` <chr>,
## #   `Unadjusted indexes...5` <chr>, `Unadjusted indexes...6` <chr>,
## #   `Unadjusted percent change...7` <chr>,
## #   `Unadjusted percent change...8` <chr>,
## #   `Seasonally adjusted percent change...9` <chr>,
## #   `Seasonally adjusted percent change...10` <chr>, …

Next I subset the data frame to grab the indent level, expenditure category, importance, and month over month and year over year un-adjusted percent changes. I rename the columns and remove spaces from column names

cpiDataSubset <- subset(cpiData, select = c(1,2,3,7,8))
names(cpiDataSubset)[3] <- "Importance"
names(cpiDataSubset)[4] <- "Unadj_pct_chg_yoy"
names(cpiDataSubset)[5]  <- "Unadj_pct_chg_mom"

colnames(cpiDataSubset) <- str_remove(colnames(cpiDataSubset), " ")

head(cpiDataSubset)

## # A tibble: 6 × 5
##   IndentLevel Expenditurecategory Importance Unadj_pct_chg_yoy Unadj_pct_chg_mom
##         <dbl> <chr>                    <dbl> <chr>             <chr>            
## 1          NA <NA>                     NA    "Jan.\n2023-\nJa… "Dec.\n2023-\nJa…
## 2          NA <NA>                     NA     <NA>              <NA>            
## 3           0 All items               100    "3.1"             "0.5"            
## 4           1 Food                     13.6  "2.6"             "0.6"            
## 5           2 Food at home              8.17 "1.2"             "0.7"            
## 6           3 Cereals and bakery…       1.07 "1.5"             "0.2"

Next, I remove the first row which has the time period info for the month over month and year over year changes. Then I remove rows with NAs

cpiDataSubset <- cpiDataSubset[-c(1),]

cpiDataSubset <- drop_na(cpiDataSubset)

head(cpiDataSubset)

## # A tibble: 6 × 5
##   IndentLevel Expenditurecategory Importance Unadj_pct_chg_yoy Unadj_pct_chg_mom
##         <dbl> <chr>                    <dbl> <chr>             <chr>            
## 1           0 All items              100     3.1               0.5              
## 2           1 Food                    13.6   2.6               0.6              
## 3           2 Food at home             8.17  1.2               0.7              
## 4           3 Cereals and bakery…      1.07  1.5               0.2              
## 5           3 Meats, poultry, fi…      1.72  -0.9              -0.1             
## 6           3 Dairy and related …      0.748 -1.1              0.4

I then convert the percent change columns to type numeric

cpiDataSubset$Unadj_pct_chg_yoy <- as.numeric(cpiDataSubset$Unadj_pct_chg_yoy)
cpiDataSubset$Unadj_pct_chg_mom <- as.numeric(cpiDataSubset$Unadj_pct_chg_mom)

Deliverable 1

The first deliverable is to see the largest month over month and year over year percent changes by category

First, I pivot the subset data frame to a longer format

cpiDataSubsetLong <- pivot_longer(cpiDataSubset, cols = !c('Expenditurecategory'), names_to = 'variable',values_to = 'value')

I then filter the data frame for the percent change measurements

cpiDataSubsetLongChanges <- cpiDataSubsetLong |> filter(variable == 'Unadj_pct_chg_yoy' | variable == 'Unadj_pct_chg_mom')

Below I visualize the data

ggplot(cpiDataSubsetLongChanges, aes (x = Expenditurecategory,
                                      y = value, fill = variable)) + geom_bar(position = 'dodge', stat = 'identity') +
  coord_flip() + 
  ggtitle('Month over Month and Year over Year percent changes by expenditure category')

There are a variety of interesting changes for the year over year data. The largest decreases come from energy related items such as utility (piped) gas service, fuel oil, motor fuel, gasoline, and energy commodities. This may also explain why airline fares saw a decrease as well, as fuel cost is likely a significant factor for airline fares. The largest year over year change came in motor vehicle insurance.

Deliverable 2

The second deliverable is to see the largest importance for each category.

The cpiDataSubset data frame essentially contains three different tables (four if you count the first row, the total CPI for All items, as a table).

For each table, the indents sum to indent the indent above it. For example, food is one table and indents 2 sum to indent 1 and indents 3 sum to indent 2 within the food table.

Because of this, I will need to break out the data frame into three different tables based on indent, such that there are three separate tables: food, energy, and all items less food and energy.

I do that by finding which rows are indent level 1 and then subsetting based on these values

rowsForTableSplit <- which(cpiDataSubset$IndentLevel==1)

rowsForTableSplit

## [1]  2 11 19

Below I subset into three different tables based on the row info above

foodSubset <- cpiDataSubset |> slice(rowsForTableSplit[1]:(rowsForTableSplit[2]-1))

energySubset <- cpiDataSubset |> slice(rowsForTableSplit[2]:(rowsForTableSplit[3]-1))

otherSubset <- cpiDataSubset |> slice(rowsForTableSplit[3]:nrow(cpiDataSubset))

Starting with the food subset, I tidy the data into a longer format by pivoting and arrange it by the variable to see the largest weights

foodSubsetPivoted <- foodSubset |> pivot_longer(cols = !c('IndentLevel', 'Expenditurecategory'),
                           names_to = 'Variable',
                           values_to = 'Value')

foodSubsetPivoted <- foodSubsetPivoted |> arrange(Variable, IndentLevel)

head(foodSubsetPivoted)

## # A tibble: 6 × 4
##   IndentLevel Expenditurecategory            Variable    Value
##         <dbl> <chr>                          <chr>       <dbl>
## 1           1 Food                           Importance 13.6  
## 2           2 Food at home                   Importance  8.17 
## 3           2 Food away from home(1)         Importance  5.39 
## 4           3 Cereals and bakery products    Importance  1.07 
## 5           3 Meats, poultry, fish, and eggs Importance  1.72 
## 6           3 Dairy and related products     Importance  0.748

From this data frame, I find the max importance, which I will use in setting the scale for the y-axis in a plot

maxWeightFood <- foodSubsetPivoted |> 
  filter(Variable =='Importance') |> 
  summarise(max_val = max(Value)) |> pull(max_val)

Below I visualize the weights for the food subset. The weights of higher indent levels are larger, but what this graph shows is how each indent level is made up. So for example, Food has a 13% importance in total CPI. Within Food, Food at home and Food away from home have an 8% and 5% weight. And then indent 3 shows the make up of food at home.

foodSubsetPivoted |> filter(Variable =='Importance') |> 
  ggplot(mapping = aes(x=IndentLevel, y= Value, fill = fct_inorder(Expenditurecategory))) + 
  geom_bar(stat = 'Identity', position = 'dodge') + 
  scale_y_continuous(breaks = seq(0, maxWeightFood, by = 1)) + 
  ggtitle('Importance to CPI by Indent Level by category')

I repeat the above process for the Energy and Other categories. I consolidate the code blocks into one for each subset.

Below is the above process but for Energy:

#Energy subset
energySubsetPivoted <- energySubset |> pivot_longer(cols = !c('IndentLevel', 'Expenditurecategory'),
                                                names_to = 'Variable',
                                                values_to = 'Value')

#arrange in tidy format to see largest weights
energySubsetPivoted <- energySubsetPivoted |> arrange(Variable, IndentLevel)

head(energySubsetPivoted)

## # A tibble: 6 × 4
##   IndentLevel Expenditurecategory Variable   Value
##         <dbl> <chr>               <chr>      <dbl>
## 1           1 Energy              Importance 6.66 
## 2           2 Energy commodities  Importance 3.54 
## 3           2 Energy services     Importance 3.12 
## 4           3 Fuel oil            Importance 0.084
## 5           3 Motor fuel          Importance 3.37 
## 6           3 Electricity         Importance 2.43

maxWeightEnergy <- energySubsetPivoted |> 
  filter(Variable =='Importance') |> 
  summarise(max_val = max(Value)) |> pull(max_val)

#visualize weights
energySubsetPivoted |> filter(Variable =='Importance') |> 
  ggplot(mapping = aes(x=IndentLevel, y= Value, fill = fct_inorder(Expenditurecategory))) + 
  geom_bar(stat = 'Identity', position = 'dodge') + 
  scale_y_continuous(breaks = seq(0, maxWeightEnergy, by = 1)) + 
  ggtitle('Importance to CPI by Indent Level by category')

Other subset

#other subset
otherSubsetPivoted <- otherSubset |> pivot_longer(cols = !c('IndentLevel', 'Expenditurecategory'),
                                                    names_to = 'Variable',
                                                    values_to = 'Value')

#arrange in tidy format to see largest weights
otherSubsetPivoted <- otherSubsetPivoted |> arrange(Variable, IndentLevel)

head(otherSubsetPivoted)

## # A tibble: 6 × 4
##   IndentLevel Expenditurecategory                          Variable   Value
##         <dbl> <chr>                                        <chr>      <dbl>
## 1           1 All items less food and energy               Importance 79.8 
## 2           2 Commodities less food and energy commodities Importance 18.9 
## 3           2 Services less energy services                Importance 60.9 
## 4           3 Apparel                                      Importance  2.51
## 5           3 New vehicles                                 Importance  3.68
## 6           3 Used cars and trucks                         Importance  2.01

maxWeightOther <- otherSubsetPivoted |> 
  filter(Variable =='Importance') |> 
  summarise(max_val = max(Value)) |> pull(max_val)

#visualize weights
otherSubsetPivoted |> filter(Variable =='Importance') |> 
  ggplot(mapping = aes(x=IndentLevel, y= Value, fill = fct_inorder(Expenditurecategory))) + 
  geom_bar(stat = 'Identity', position = 'dodge') + 
  scale_y_continuous(breaks = seq(0, maxWeightOther, by = 5)) + 
  ggtitle('Importance to CPI by Indent Level by category')

project2_CPI_report

Semyon Toybis

2024-03-02

Loading the CPI report

Cleaning and tidying the data

Deliverable 1

Deliverable 2