knitr::opts_chunk$set(echo = TRUE,message = FALSE,warning = FALSE)

Load the required packages

library(topicmodels)
library(quanteda)
library(tidyverse)
library(tidytext)
library(seededlda)
library(topicdoc)
library(LDAvis)
library(topicmodels)
library(ldatuning)
library(topicdoc)
library(stm)
library(keyATM)
library(kableExtra)

About the Data

Problem Description

Awards_df <- readtext::readtext("data/award_data.csv", 
                            text_field = "Abstract")
glimpse(Awards_df)
## Rows: 195,885
## Columns: 39
## $ doc_id                                  <chr> "award_data.csv.1", "award_dat…
## $ text                                    <chr> "In the last decade there has …
## $ Company                                 <chr> "0 Base Design, LLC", "1109 Br…
## $ Award.Title                             <chr> "Opportunistic Passive RF Dete…
## $ Agency                                  <chr> "Department of Defense", "Depa…
## $ Branch                                  <chr> "Air Force", "Air Force", "Air…
## $ Phase                                   <chr> "Phase II", "Phase I", "Phase …
## $ Program                                 <chr> "STTR", "STTR", "STTR", "SBIR"…
## $ Agency.Tracking.Number                  <chr> "FX20D-TCSO1-0113", "FX21B-TCS…
## $ Contract                                <chr> "FA864922P0007", "FA864922P008…
## $ Proposal.Award.Date                     <chr> "10/13/2021", "11/02/2021", "1…
## $ Contract.End.Date                       <chr> "01/13/2023", "02/02/2022", "0…
## $ Solicitation.Number                     <chr> "X20.D", "X21.B", "X21.B", "X2…
## $ Solicitation.Year                       <int> 2020, 2021, 2021, 2021, 2021, …
## $ Topic.Code                              <chr> "AFX20D-TCSO1", "AF21B-TCSO1",…
## $ Award.Year                              <int> 2022, 2022, 2022, 2022, 2022, …
## $ Award.Amount                            <chr> "749,922", "49,999", "49,999",…
## $ Duns                                    <chr> "'080483074'", "'081037737'", …
## $ HUBZone.Owned                           <chr> "N", "N", "N", "N", "N", "N", …
## $ Socially.and.Economically.Disadvantaged <chr> "N", "N", "N", "N", "N", "N", …
## $ Women.Owned                             <chr> "Y", "N", "N", "N", "N", "N", …
## $ Number.Employees                        <int> 6, 5, 5, 4, 7, 3, NA, 2, 14, 1…
## $ Company.Website                         <chr> "www.0basedesign.com", "http:/…
## $ Address1                                <chr> "5107 Unicon Dr Unit K", "300 …
## $ Address2                                <chr> "", "", "", "", "", "", "", ""…
## $ City                                    <chr> "Wake Forest", "louisville", "…
## $ State                                   <chr> "NC", "KY", "KY", "WA", "CA", …
## $ Zip                                     <chr> "27587-5020", "40206-1972", "4…
## $ Contact.Name                            <chr> "Joseph Murray", "James Sass",…
## $ Contact.Title                           <chr> "", "", "", "", "", "", "", ""…
## $ Contact.Phone                           <chr> "(919) 606-5330", "(502) 641-1…
## $ Contact.Email                           <chr> "jmurray@0basedesign.com", "ja…
## $ PI.Name                                 <chr> "John Swartz", "James Sass", "…
## $ PI.Title                                <chr> "", "", "", "", "", "", "", ""…
## $ PI.Phone                                <chr> "(919) 889-3361", "(502) 641-1…
## $ PI.Email                                <chr> "john.swartz@wrc-nc.org", "jam…
## $ RI.Name                                 <chr> "Wireless Research Center of N…
## $ RI.POC.Name                             <chr> "John Swartz", "Samuel Riehn",…
## $ RI.POC.Phone                            <chr> "(919) 889-3361", "(573) 803-8…
Awards_corp <- corpus(Awards_df)
Awards_corp
## Corpus consisting of 195,885 documents and 37 docvars.
## award_data.csv.1 :
## "In the last decade there has been an exponential growth in t..."
## 
## award_data.csv.2 :
## "1109 BRAVO’s patented NeuroPak is a light-weight wearable tr..."
## 
## award_data.csv.3 :
## "1109 BRAVO’s Neuropak provides an efficient, effective, and ..."
## 
## award_data.csv.4 :
## "1st1 Technologies (1st1) proposes the integration of our dis..."
## 
## award_data.csv.5 :
## "Glaucoma, a disease that damages your eye's optic nerve, aff..."
## 
## award_data.csv.6 :
## "2X4Lab will investigate the applicability of its Cloud-Based..."
## 
## [ reached max_ndoc ... 195,879 more documents ]
StopWords = c("one", "two", "three", "first", "second", "third",
                "include", "includes", "including", "included", 
                "use", "uses", "using", "used", "comprises",
                "comprising", "based", "may", "also", "can", "whether",
                "phase","low","high","proposed","system*","unit",
                "quality","integration","unit","mirror","emerging",
                "temporal","extraction","stabilization","production","produce",
                "truma","proper")
Awards_toks <- tokens(
  Awards_corp, 
  remove_punct = TRUE,
  remove_numbers = TRUE, 
  remove_symbols = TRUE,
  remove_url = TRUE,
  split_hyphens = FALSE)
Awards_toks
## Tokens consisting of 195,885 documents and 37 docvars.
## award_data.csv.1 :
##  [1] "In"          "the"         "last"        "decade"      "there"      
##  [6] "has"         "been"        "an"          "exponential" "growth"     
## [11] "in"          "the"        
## [ ... and 339 more ]
## 
## award_data.csv.2 :
##  [1] "BRAVO's"      "patented"     "NeuroPak"     "is"           "a"           
##  [6] "light-weight" "wearable"     "training"     "device"       "designed"    
## [11] "with"         "the"         
## [ ... and 20 more ]
## 
## award_data.csv.3 :
##  [1] "BRAVO's"       "Neuropak"      "provides"      "an"           
##  [5] "efficient"     "effective"     "and"           "user-friendly"
##  [9] "approach"      "to"            "human"         "performance"  
## [ ... and 19 more ]
## 
## award_data.csv.4 :
##  [1] "1st1"            "Technologies"    "1st1"            "proposes"       
##  [5] "the"             "integration"     "of"              "our"            
##  [9] "disruptive"      "algorithm-based" "image"           "matching"       
## [ ... and 23 more ]
## 
## award_data.csv.5 :
##  [1] "Glaucoma" "a"        "disease"  "that"     "damages"  "your"    
##  [7] "eye's"    "optic"    "nerve"    "affects"  "more"     "than"    
## [ ... and 29 more ]
## 
## award_data.csv.6 :
##  [1] "2X4Lab"        "will"          "investigate"   "the"          
##  [5] "applicability" "of"            "its"           "Cloud-Based"  
##  [9] "Workflow"      "Automation"    "Platform"      "for"          
## [ ... and 18 more ]
## 
## [ reached max_ndoc ... 195,879 more documents ]
Awards_dfmat <- dfm(Awards_toks, tolower = TRUE) %>%
  dfm_remove(c(stopwords("en"), StopWords)) %>%
  dfm_trim(min_termfreq = 5, min_docfreq = 10)
Awards_dfmat
## Document-feature matrix of: 195,885 documents, 43,892 features (99.82% sparse) and 37 docvars.
##                   features
## docs               last decade exponential growth technology application
##   award_data.csv.1    1      1           1      1          3           1
##   award_data.csv.2    0      0           0      0          0           0
##   award_data.csv.3    0      0           0      0          0           0
##   award_data.csv.4    0      0           0      0          1           0
##   award_data.csv.5    0      0           0      0          0           0
##   award_data.csv.6    0      0           0      0          0           0
##                   features
## docs               unmanned air vehicles uavs
##   award_data.csv.1        2   5        3    9
##   award_data.csv.2        0   0        0    0
##   award_data.csv.3        0   0        0    0
##   award_data.csv.4        0   0        0    0
##   award_data.csv.5        0   0        0    0
##   award_data.csv.6        0   0        0    0
## [ reached max_ndoc ... 195,879 more documents, reached max_nfeat ... 43,882 more features ]
Awards_key_list = list(
  Agriculture = c("manure","produc*", "fungus", "pesticide*", "tractor","soil",
                  "hydroponic*","fruit","vegetable","market","food","water","energy",
                  "rural","farming","Livestock","animal","horticulture","crop","drought",
                  "leaf","weed","organic","irrigation","cultivate","dairy","milk","technology"),
  Technology = c("machine","innovation","innovate","science","research","experience","AI",
                 "center  of excellence","cloud","robotics","IOT","DATA","Analytics","bigdata",
                 "Automation","programming","blockchain","cyber security",
                 "Artifical Intellegence","ml","machinelearning","image processing","NEURAL*"),
  Defence = c("lidar","radar","drone","sensor","beam","nano","micro","navigation",
              "Autonomous", "Nuclear","bioadhesive","weapon","diode","security","simulation",
              "SNIPPER","RFID","radio","nearfield","robotic","aircrafts",
              "light weight","research","real time","nanaoscope","monitoring",
              "deeplearning","Ammunition","roboticarm","antena","satellite","areial*",
              "frequecny","Ballistic","holographic"),
  Health = c("vaccine","robotics","surgery","microscopic*","medecine","cure",
             "oral vaccine","osmosis", "pediatric","cardio*","renal","pharma*",
             "retina","plasma","stem cells","stem","stroke","cell","therapy","phyiso*",
             "genomics","Ayurveda","Probiotics","steroids","corona","HIV
             Blood plasma","ADHD","xray","scanning","endoscopy","MRI","LASER","tissue",
             "roboticarm","covid*","utlrasound"))
keyATM_docs <- keyATM_read(texts = Awards_dfmat)
summary(keyATM_docs)
## keyATM_docs object of: 195885 documents.
## Length of documents:
##   Avg: 109.938
##   Min: 0
##   Max: 1476
##    SD: 79.382
## Number of unique words: 400563

Visualization

awards_key_viz <- visualize_keywords(docs = keyATM_docs,keywords = Awards_key_list)
save_fig(awards_key_viz,"keyword.pdf", width =6.5, height =4)
awards_key_viz

keyATM Base Model

###COMMENTING THE MODEL FITTING AS IT TAKES LONG TIME

# sr_tmod_keyatm_base <- keyATM(docs = keyATM_docs,# text input
#                               no_keyword_topics =2,# number of topics without keywords
#                               keywords = Awards_key_list, # keywords
#                               model ="base",
#                               options = list(seed =123))
# saveRDS(awards_tmod_keyatm_dyn, file = "./model_base.rda")

model_old_base = readRDS("model_base.rda")
kable(top_words(model_old_base,10), format = "html", table.attr = "style='width:30%;'") %>%
  kableExtra::kable_styling()
1_Agriculture 2_Technology 3_Defence 4_Health Other_1 Other_2
energy [✓] data data clinical n design
technology software sensor [✓] patients power materials
project project design treatment applications technology
water [✓] research [✓] technology develop laser performance
process information develop disease optical thermal
power develop performance project technology material
commercial training provide development design ii
cost development sensors cells develop develop
research [2] health ii drug new development
fuel provide development human devices process

keyATM Dynamic Model

vars <- docvars(Awards_corp)
vars_period <- vars %>%
  as_tibble() %>%
  mutate(period = (vars$Award.Year - 1982)) %>% 
  select(period)
vars_period <- arrange(vars_period, period)
###COMMENTING THE MODEL FITTING AS IT TAKES LONG TIME

# awards_tmod_keyatm_dyn <- keyATM(
#   docs = keyATM_docs,
#   no_keyword_topics = 2,
#   keywords = Awards_key_list,
#   model = "dynamic",
#   model_settings = list(time_index = vars_period$period,
#                         num_states = 2), # for HMM
#   options = list(seed = 123))
# saveRDS(awards_tmod_keyatm_dyn, file = "./model.rda")

model_old = readRDS("model.rda")
# plot_timetrend(model_old, time_index_label=vars$year, xlab="Year")
plot_timetrend(model_old, time_index_label=vars$Award.Year[1:model_old$N], xlab="Year")

Results, Summary, Discussion:

kable(top_words(model_old_base,5), format = "html", table.attr = "style='width:30%;'") %>%
  kableExtra::kable_styling()
1_Agriculture 2_Technology 3_Defence 4_Health Other_1 Other_2
energy [✓] data data clinical n design
technology software sensor [✓] patients power materials
project project design treatment applications technology
water [✓] research [✓] technology develop laser performance
process information develop disease optical thermal