Medium.Term.Rate) is binary.fomc_data <-readRDS(file = "fomc_merged_data_v2.rds")
head(select(fomc_data, Index,year,statement.dates,statement.length,date_mdy,Employment.Growth,Economic.Growth,Inflation,Medium.Term.Rate,Policy.Rate))
## Index year statement.dates statement.length date_mdy Employment.Growth
## 1 1 2007 20070131 181 2007-01-31 Flat
## 2 2 2007 20070321 167 2007-03-21 Flat
## 3 3 2007 20070509 168 2007-05-09 Flat
## 4 4 2007 20070628 179 2007-06-28 Flat
## 5 5 2007 20070807 207 2007-08-07 Flat
## 6 6 2007 20070817 130 2007-08-17 Flat
## Economic.Growth Inflation Medium.Term.Rate Policy.Rate
## 1 Up Down Hawk Flat
## 2 Flat Up Hawk Flat
## 3 Down Up Hawk Flat
## 4 Up Down Hawk Flat
## 5 Up Flat Hawk Flat
## 6 Down Flat Dove Flat
set.seed(1234567)
fomc_Rand <- fomc_data[sample(nrow(fomc_data)),]
the federal open market committee and committee as it is present in all the statementscustomStopWords <- c("the federal open market committee", "committee")
fomc_dataX <- fomc_Rand %>% mutate(statement.content = tolower(statement.content))
fomc_dataX$statement.content <- str_replace_all(fomc_dataX$statement.content, customStopWords, "")
# form a corpus
corpus <- VCorpus(VectorSource(fomc_dataX$statement.content))
# Remove Punctuation
corpus <- tm_map(corpus, content_transformer(removePunctuation))
# Remove numbers
corpus <- tm_map(corpus, removeNumbers)
# Convert to lower case
corpus <- tm_map(corpus, content_transformer(tolower))
# Remove stop words
corpus <- tm_map(corpus, content_transformer(removeWords), stopwords("english"))
##Stemming
corpus <- tm_map(corpus, stemDocument)
# Remove Whitespace
corpus <- tm_map(corpus, stripWhitespace)
# Create Document Term Matrix
dtm <- DocumentTermMatrix(corpus)
# handle sparsity
corpusX <- removeSparseTerms(dtm, 0.95)
# convert to matrix
data_matrix <- as.matrix(corpusX)
Medium.Term.RateMedium.Term.Rate variablemRate <- data_matrix
# attach the 'medium.term.rate' column
mRate_matrix <- cbind(mRate, fomc_dataX$Medium.Term.Rate)
# rename it to 'tone'
colnames(mRate_matrix)[ncol(mRate_matrix)] <- "tone"
# convert to data frame
mRateData <- as.data.frame(mRate_matrix)
# convert 'tone' to lower case and make it a factor column as well
mRateData$tone <- as.factor(tolower(mRateData$tone))
mRate_n <- nrow(mRateData)
mRateTrainVolume <- round(mRate_n * 0.80)
set.seed(314)
mRateTrainIndex <- sample(mRate_n, mRateTrainVolume)
mRateTrain <- mRateData[mRateTrainIndex,]
mRateTest <- mRateData[-mRateTrainIndex,]
mRateModel <- train(tone ~., data = mRateTrain, method = 'svmLinear3')
mRateResult <- predict(mRateModel, newdata = mRateTest)
( mRateStats = confusionMatrix( mRateResult, mRateTest$tone))
## Confusion Matrix and Statistics
##
## Reference
## Prediction dove hawk
## dove 16 0
## hawk 0 4
##
## Accuracy : 1
## 95% CI : (0.8316, 1)
## No Information Rate : 0.8
## P-Value [Acc > NIR] : 0.01153
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0
## Specificity : 1.0
## Pos Pred Value : 1.0
## Neg Pred Value : 1.0
## Prevalence : 0.8
## Detection Rate : 0.8
## Detection Prevalence : 0.8
## Balanced Accuracy : 1.0
##
## 'Positive' Class : dove
##
Economic.GrowthEconomic.Growth variableeconGrowth <- data_matrix
# attach the 'Economic.Growth' column
econG_matrix <- cbind(econGrowth, tolower(fomc_dataX$Economic.Growth))
# rename it to 'growth'
colnames(econG_matrix)[ncol(econG_matrix)] <- "egrowth"
# convert to data frame
econData <- as.data.frame(econG_matrix)
# convert 'growth' to a factor column as well
econData$egrowth <- as.factor(econData$egrowth)
econ_n <- nrow(econData)
econTrainVolume <- round(econ_n * 0.58)
set.seed(314)
econTrainIndex <- sample(econ_n, econTrainVolume)
econTrain <- econData[econTrainIndex,]
econTest <- econData[-econTrainIndex,]
econModel <- train(egrowth ~., data = econTrain, method = 'svmLinear3')
econResult <- predict(econModel, newdata = econTest)
(econStats = confusionMatrix( econResult, econTest$egrowth))
## Confusion Matrix and Statistics
##
## Reference
## Prediction down flat up
## down 6 0 1
## flat 0 4 0
## up 4 4 24
##
## Overall Statistics
##
## Accuracy : 0.7907
## 95% CI : (0.6396, 0.8996)
## No Information Rate : 0.5814
## P-Value [Acc > NIR] : 0.003287
##
## Kappa : 0.5913
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: down Class: flat Class: up
## Sensitivity 0.6000 0.50000 0.9600
## Specificity 0.9697 1.00000 0.5556
## Pos Pred Value 0.8571 1.00000 0.7500
## Neg Pred Value 0.8889 0.89744 0.9091
## Prevalence 0.2326 0.18605 0.5814
## Detection Rate 0.1395 0.09302 0.5581
## Detection Prevalence 0.1628 0.09302 0.7442
## Balanced Accuracy 0.7848 0.75000 0.7578
InflationInflation variable# Create Document Term Matrix
dtmI <- DocumentTermMatrix(corpus)
# handle sparsity
corpusI <- removeSparseTerms(dtm, 0.94)
# convert to matrix
data_matrixI <- as.matrix(corpusI)
inflation <- data_matrixI
# attach the 'Inflation' column
inflation_matrix <- cbind(inflation, tolower(fomc_dataX$Inflation))
# rename it to 'inflation'
colnames(inflation_matrix)[ncol(inflation_matrix)] <- "inflation"
# convert to data frame
inflationData <- as.data.frame(inflation_matrix)
# convert 'inflation' to a factor column
inflationData$inflation <- as.factor(inflationData$inflation)
infDataX <- inflationData[, -which(names(inflationData) %in% c("although", "william", "richard", "raphael", "randal", "san", "sarah","sandra", "togeth", "timothi","committe","dudley","esther"))]
inf_n <- nrow(infDataX)
infTrainVolume <- round(inf_n * 0.70)
set.seed(314)
infTrainIndex <- sample(inf_n, infTrainVolume)
infTrain <- infDataX[infTrainIndex,]
infTest <- infDataX[-infTrainIndex,]
inflationModel <- train(inflation ~., data = infTrain, method="svmLinear3")
inflationResult <- predict(inflationModel, newdata = infTest)
( infStats = confusionMatrix( inflationResult, infTest$inflation))
## Confusion Matrix and Statistics
##
## Reference
## Prediction down flat up
## down 11 1 0
## flat 7 7 1
## up 1 1 2
##
## Overall Statistics
##
## Accuracy : 0.6452
## 95% CI : (0.4537, 0.8077)
## No Information Rate : 0.6129
## P-Value [Acc > NIR] : 0.4324
##
## Kappa : 0.4181
##
## Mcnemar's Test P-Value : 0.1386
##
## Statistics by Class:
##
## Class: down Class: flat Class: up
## Sensitivity 0.5789 0.7778 0.66667
## Specificity 0.9167 0.6364 0.92857
## Pos Pred Value 0.9167 0.4667 0.50000
## Neg Pred Value 0.5789 0.8750 0.96296
## Prevalence 0.6129 0.2903 0.09677
## Detection Rate 0.3548 0.2258 0.06452
## Detection Prevalence 0.3871 0.4839 0.12903
## Balanced Accuracy 0.7478 0.7071 0.79762
Employment.GrowthEmployment.Growth variableempGrowth <- data_matrix
# attach the 'Employment.Growth column
emp_matrix <- cbind(empGrowth, tolower(fomc_dataX$Employment.Growth))
# rename it to 'empGrowth'
colnames(emp_matrix)[ncol(emp_matrix)] <- "empGrowth"
# convert to data frame
empData <- as.data.frame(emp_matrix)
# convert 'empGrowth' to a factor column as well
empData$empGrowth <- as.factor(empData$empGrowth)
emp_n <- nrow(empData)
empTrainVolume <- round(emp_n * 0.80)
set.seed(314)
empTrainIndex <- sample(emp_n, empTrainVolume)
empTrain <- empData[empTrainIndex,]
empTest <- empData[-empTrainIndex,]
empModel <- train(empGrowth ~., data = empTrain, method = 'svmLinear3')
empResult <- predict(empModel, newdata = empTest)
( empStats = confusionMatrix( empResult, empTest$empGrowth))
## Confusion Matrix and Statistics
##
## Reference
## Prediction down flat up
## down 4 0 1
## flat 0 4 0
## up 0 1 10
##
## Overall Statistics
##
## Accuracy : 0.9
## 95% CI : (0.683, 0.9877)
## No Information Rate : 0.55
## P-Value [Acc > NIR] : 0.0009274
##
## Kappa : 0.8326
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: down Class: flat Class: up
## Sensitivity 1.0000 0.8000 0.9091
## Specificity 0.9375 1.0000 0.8889
## Pos Pred Value 0.8000 1.0000 0.9091
## Neg Pred Value 1.0000 0.9375 0.8889
## Prevalence 0.2000 0.2500 0.5500
## Detection Rate 0.2000 0.2000 0.5000
## Detection Prevalence 0.2500 0.2000 0.5500
## Balanced Accuracy 0.9688 0.9000 0.8990
Policy.RatePolicy.Rate variableplRate <- data_matrix
# attach the 'Policy.Rate' column
pl_matrix <- cbind(plRate, tolower(fomc_dataX$Policy.Rate))
# rename it to 'empGrowth'
colnames(pl_matrix)[ncol(pl_matrix)] <- "policy"
# convert to data frame
plData <- as.data.frame(pl_matrix)
# convert 'policy' to a factor column as well
plData$policy <- as.factor(plData$policy)
pl_n <- nrow(plData)
plTrainVolume <- round(pl_n * 0.80)
set.seed(314)
plTrainIndex <- sample(pl_n, plTrainVolume)
plTrain <- plData[empTrainIndex,]
plTest <- plData[-empTrainIndex,]
plModel <- train(policy ~., data = plTrain, method = 'svmLinear3')
plResult <- predict(plModel, newdata = plTest)
( plStats = confusionMatrix( plResult, plTest$policy))
## Confusion Matrix and Statistics
##
## Reference
## Prediction flat lower raise
## flat 14 0 1
## lower 0 5 0
## raise 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.95
## 95% CI : (0.7513, 0.9987)
## No Information Rate : 0.7
## P-Value [Acc > NIR] : 0.007637
##
## Kappa : 0.8788
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: flat Class: lower Class: raise
## Sensitivity 1.0000 1.00 0.00
## Specificity 0.8333 1.00 1.00
## Pos Pred Value 0.9333 1.00 NaN
## Neg Pred Value 1.0000 1.00 0.95
## Prevalence 0.7000 0.25 0.05
## Detection Rate 0.7000 0.25 0.00
## Detection Prevalence 0.7500 0.25 0.00
## Balanced Accuracy 0.9167 1.00 0.50
Table of summary
results <- tibble(variable = c("Medium.Term.Rate","Employment.Growth","Economic.Growth","Inflation","Policy.Rate"), modelling = c("80 : 20", "80 : 20", "58 : 42", "70 : 30", "80 : 20"), accuracy= c(100, 90, 79.1, 64.52, 95))
kable(results,
col.names = linebreak(c("Variable", "Modelling (Train : Test)", "Accuracy (%)"), align = "c")) %>%
kable_styling("striped", full_width = T) %>%
column_spec(1:3, bold = T, color = "#000") %>%
row_spec(1:5, bold = T, color = "#000")
| Variable | Modelling (Train : Test) | Accuracy (%) |
|---|---|---|
| Medium.Term.Rate | 80 : 20 | 100.00 |
| Employment.Growth | 80 : 20 | 90.00 |
| Economic.Growth | 58 : 42 | 79.10 |
| Inflation | 70 : 30 | 64.52 |
| Policy.Rate | 80 : 20 | 95.00 |
For some of the variables, extra fine-tuning of the data was not needed to achieve appreciable accuracy. But for the Economic.Growth variable, we needed to adjust the ratio of training set to test data set to 58:42 to achieve an accuracy of 79.10%. For the Inflation variable we had to adjust sparsity to 0.94, removed 13 unuseful columns, adjusted the ratio of training to test data sets to 70:30 before we could achieve an accuracy of 64.52%
We can conclude that these values obtained, though not perfect, did go a long way to align with the human based scoring/classification of the economy trends based on the variables considered within the selected years. There are much room left for improvements and further analysis but we cannot go beyond this level right now as time will not permit us
Text classification of FOMC statements is not generally a research objective but we think it is worthwhile. Classification addresses a potential need: Can a machine correctly infer the opinion or direction of forward guidance or policy decisions in a structured text by the FOMC? In this regard, the classification problem for the FOMC is isomorphic to the Ham-Spam classification of incoming emails by an email program. However, the reader may object that FOMC statements are not so voluminous to require automated processing. Our response is that FOMC statements are merely the first baby step in a much larger classification problem: the public communications of all FOMC and Federal Reserve system members. As previously explained, the FOMC members give speeches, publish articles, appear on TV interviews. Moreover, FOMC meeting minutes are released several weeks after the policy statement is released. These are much longer and required more effort to read and digest. Also, the FOMC transcripts released several years after the meeting may run to over 100 pages each. They contain word for word replay of the entire meeting (excluding private discussions). Lastly, there are at least 16 relevant central banks around the world of interest. Although the Fed is the world’s more important central bank, the ECB, Bank of England, Bank of Japan, Bank of China, Bank of Australia, Bank of New Zealand, all produce communications. In summary, no single person can read all central bank communications. The ability to extract key messages from plain texts remains a valuable capability.
Our machine learning prediction backtest suggests that automated classification is feasible to detect limited features of a central bank communication. Our algorithm succeeds at detecting medium term rate outlooks, employment growth and policy rate changes. At these tasks, we have attained accuracy rates between 90-100 percent. The most challenging attribute to understand is inflation (64.5%). This is consistent with financial practitioner opinion. Inflation is the most complex of these areas to quantify, control and manage. That is because inflation has 4 distinct aspects: realized inflation (price changes from past surveys), market based real yields of TIPS bonds and inflation swaps, and long term expectations of inflation, inflation measured with or without volatile sectors: food and energy. Because the statements may treat some or all of these aspects we believe accuracy in understanding FOMC inflation views is hard.