knitr::opts_chunk$set(echo = TRUE,message = FALSE,warning = FALSE)
library(topicmodels)
library(quanteda)
library(tidyverse)
library(tidytext)
library(seededlda)
library(topicdoc)
library(LDAvis)
library(topicmodels)
library(ldatuning)
library(topicdoc)
library(stm)
library(keyATM)
library(kableExtra)
Find the top keywords in each keyword topics Agriculture,Technology, Defense, Health.
Investigate and analyze how the prevalence of topics change over time.
Read Data Using readtext
All the data which we have used is also available in the below location. https://drive.google.com/file/d/1WesyjYfkqLuePTHvu-KuZGidsMJ9WAFP/view?usp=sharing
Awards_df <- readtext::readtext("data/award_data.csv",
text_field = "Abstract")
glimpse(Awards_df)
## Rows: 195,885
## Columns: 39
## $ doc_id <chr> "award_data.csv.1", "award_dat…
## $ text <chr> "In the last decade there has …
## $ Company <chr> "0 Base Design, LLC", "1109 Br…
## $ Award.Title <chr> "Opportunistic Passive RF Dete…
## $ Agency <chr> "Department of Defense", "Depa…
## $ Branch <chr> "Air Force", "Air Force", "Air…
## $ Phase <chr> "Phase II", "Phase I", "Phase …
## $ Program <chr> "STTR", "STTR", "STTR", "SBIR"…
## $ Agency.Tracking.Number <chr> "FX20D-TCSO1-0113", "FX21B-TCS…
## $ Contract <chr> "FA864922P0007", "FA864922P008…
## $ Proposal.Award.Date <chr> "10/13/2021", "11/02/2021", "1…
## $ Contract.End.Date <chr> "01/13/2023", "02/02/2022", "0…
## $ Solicitation.Number <chr> "X20.D", "X21.B", "X21.B", "X2…
## $ Solicitation.Year <int> 2020, 2021, 2021, 2021, 2021, …
## $ Topic.Code <chr> "AFX20D-TCSO1", "AF21B-TCSO1",…
## $ Award.Year <int> 2022, 2022, 2022, 2022, 2022, …
## $ Award.Amount <chr> "749,922", "49,999", "49,999",…
## $ Duns <chr> "'080483074'", "'081037737'", …
## $ HUBZone.Owned <chr> "N", "N", "N", "N", "N", "N", …
## $ Socially.and.Economically.Disadvantaged <chr> "N", "N", "N", "N", "N", "N", …
## $ Women.Owned <chr> "Y", "N", "N", "N", "N", "N", …
## $ Number.Employees <int> 6, 5, 5, 4, 7, 3, NA, 2, 14, 1…
## $ Company.Website <chr> "www.0basedesign.com", "http:/…
## $ Address1 <chr> "5107 Unicon Dr Unit K", "300 …
## $ Address2 <chr> "", "", "", "", "", "", "", ""…
## $ City <chr> "Wake Forest", "louisville", "…
## $ State <chr> "NC", "KY", "KY", "WA", "CA", …
## $ Zip <chr> "27587-5020", "40206-1972", "4…
## $ Contact.Name <chr> "Joseph Murray", "James Sass",…
## $ Contact.Title <chr> "", "", "", "", "", "", "", ""…
## $ Contact.Phone <chr> "(919) 606-5330", "(502) 641-1…
## $ Contact.Email <chr> "jmurray@0basedesign.com", "ja…
## $ PI.Name <chr> "John Swartz", "James Sass", "…
## $ PI.Title <chr> "", "", "", "", "", "", "", ""…
## $ PI.Phone <chr> "(919) 889-3361", "(502) 641-1…
## $ PI.Email <chr> "john.swartz@wrc-nc.org", "jam…
## $ RI.Name <chr> "Wireless Research Center of N…
## $ RI.POC.Name <chr> "John Swartz", "Samuel Riehn",…
## $ RI.POC.Phone <chr> "(919) 889-3361", "(573) 803-8…
Awards_corp <- corpus(Awards_df)
Awards_corp
## Corpus consisting of 195,885 documents and 37 docvars.
## award_data.csv.1 :
## "In the last decade there has been an exponential growth in t..."
##
## award_data.csv.2 :
## "1109 BRAVO’s patented NeuroPak is a light-weight wearable tr..."
##
## award_data.csv.3 :
## "1109 BRAVO’s Neuropak provides an efficient, effective, and ..."
##
## award_data.csv.4 :
## "1st1 Technologies (1st1) proposes the integration of our dis..."
##
## award_data.csv.5 :
## "Glaucoma, a disease that damages your eye's optic nerve, aff..."
##
## award_data.csv.6 :
## "2X4Lab will investigate the applicability of its Cloud-Based..."
##
## [ reached max_ndoc ... 195,879 more documents ]
StopWords = c("one", "two", "three", "first", "second", "third",
"include", "includes", "including", "included",
"use", "uses", "using", "used", "comprises",
"comprising", "based", "may", "also", "can", "whether",
"phase","low","high","proposed","system*","unit",
"quality","integration","unit","mirror","emerging",
"temporal","extraction","stabilization","production","produce",
"truma","proper")
Awards_toks <- tokens(
Awards_corp,
remove_punct = TRUE,
remove_numbers = TRUE,
remove_symbols = TRUE,
remove_url = TRUE,
split_hyphens = FALSE)
Awards_toks
## Tokens consisting of 195,885 documents and 37 docvars.
## award_data.csv.1 :
## [1] "In" "the" "last" "decade" "there"
## [6] "has" "been" "an" "exponential" "growth"
## [11] "in" "the"
## [ ... and 339 more ]
##
## award_data.csv.2 :
## [1] "BRAVO's" "patented" "NeuroPak" "is" "a"
## [6] "light-weight" "wearable" "training" "device" "designed"
## [11] "with" "the"
## [ ... and 20 more ]
##
## award_data.csv.3 :
## [1] "BRAVO's" "Neuropak" "provides" "an"
## [5] "efficient" "effective" "and" "user-friendly"
## [9] "approach" "to" "human" "performance"
## [ ... and 19 more ]
##
## award_data.csv.4 :
## [1] "1st1" "Technologies" "1st1" "proposes"
## [5] "the" "integration" "of" "our"
## [9] "disruptive" "algorithm-based" "image" "matching"
## [ ... and 23 more ]
##
## award_data.csv.5 :
## [1] "Glaucoma" "a" "disease" "that" "damages" "your"
## [7] "eye's" "optic" "nerve" "affects" "more" "than"
## [ ... and 29 more ]
##
## award_data.csv.6 :
## [1] "2X4Lab" "will" "investigate" "the"
## [5] "applicability" "of" "its" "Cloud-Based"
## [9] "Workflow" "Automation" "Platform" "for"
## [ ... and 18 more ]
##
## [ reached max_ndoc ... 195,879 more documents ]
Awards_dfmat <- dfm(Awards_toks, tolower = TRUE) %>%
dfm_remove(c(stopwords("en"), StopWords)) %>%
dfm_trim(min_termfreq = 5, min_docfreq = 10)
Awards_dfmat
## Document-feature matrix of: 195,885 documents, 43,892 features (99.82% sparse) and 37 docvars.
## features
## docs last decade exponential growth technology application
## award_data.csv.1 1 1 1 1 3 1
## award_data.csv.2 0 0 0 0 0 0
## award_data.csv.3 0 0 0 0 0 0
## award_data.csv.4 0 0 0 0 1 0
## award_data.csv.5 0 0 0 0 0 0
## award_data.csv.6 0 0 0 0 0 0
## features
## docs unmanned air vehicles uavs
## award_data.csv.1 2 5 3 9
## award_data.csv.2 0 0 0 0
## award_data.csv.3 0 0 0 0
## award_data.csv.4 0 0 0 0
## award_data.csv.5 0 0 0 0
## award_data.csv.6 0 0 0 0
## [ reached max_ndoc ... 195,879 more documents, reached max_nfeat ... 43,882 more features ]
Awards_key_list = list(
Agriculture = c("manure","produc*", "fungus", "pesticide*", "tractor","soil",
"hydroponic*","fruit","vegetable","market","food","water","energy",
"rural","farming","Livestock","animal","horticulture","crop","drought",
"leaf","weed","organic","irrigation","cultivate","dairy","milk","technology"),
Technology = c("machine","innovation","innovate","science","research","experience","AI",
"center of excellence","cloud","robotics","IOT","DATA","Analytics","bigdata",
"Automation","programming","blockchain","cyber security",
"Artifical Intellegence","ml","machinelearning","image processing","NEURAL*"),
Defence = c("lidar","radar","drone","sensor","beam","nano","micro","navigation",
"Autonomous", "Nuclear","bioadhesive","weapon","diode","security","simulation",
"SNIPPER","RFID","radio","nearfield","robotic","aircrafts",
"light weight","research","real time","nanaoscope","monitoring",
"deeplearning","Ammunition","roboticarm","antena","satellite","areial*",
"frequecny","Ballistic","holographic"),
Health = c("vaccine","robotics","surgery","microscopic*","medecine","cure",
"oral vaccine","osmosis", "pediatric","cardio*","renal","pharma*",
"retina","plasma","stem cells","stem","stroke","cell","therapy","phyiso*",
"genomics","Ayurveda","Probiotics","steroids","corona","HIV
Blood plasma","ADHD","xray","scanning","endoscopy","MRI","LASER","tissue",
"roboticarm","covid*","utlrasound"))
keyATM_docs <- keyATM_read(texts = Awards_dfmat)
summary(keyATM_docs)
## keyATM_docs object of: 195885 documents.
## Length of documents:
## Avg: 109.938
## Min: 0
## Max: 1476
## SD: 79.382
## Number of unique words: 400563
awards_key_viz <- visualize_keywords(docs = keyATM_docs,keywords = Awards_key_list)
save_fig(awards_key_viz,"keyword.pdf", width =6.5, height =4)
awards_key_viz
###COMMENTING THE MODEL FITTING AS IT TAKES LONG TIME
# sr_tmod_keyatm_base <- keyATM(docs = keyATM_docs,# text input
# no_keyword_topics =2,# number of topics without keywords
# keywords = Awards_key_list, # keywords
# model ="base",
# options = list(seed =123))
# saveRDS(awards_tmod_keyatm_dyn, file = "./model_base.rda")
model_old_base = readRDS("model_base.rda")
kable(top_words(model_old_base,10), format = "html", table.attr = "style='width:30%;'") %>%
kableExtra::kable_styling()
| 1_Agriculture | 2_Technology | 3_Defence | 4_Health | Other_1 | Other_2 |
|---|---|---|---|---|---|
| energy [✓] | data | data | clinical | n | design |
| technology | software | sensor [✓] | patients | power | materials |
| project | project | design | treatment | applications | technology |
| water [✓] | research [✓] | technology | develop | laser | performance |
| process | information | develop | disease | optical | thermal |
| power | develop | performance | project | technology | material |
| commercial | training | provide | development | design | ii |
| cost | development | sensors | cells | develop | develop |
| research [2] | health | ii | drug | new | development |
| fuel | provide | development | human | devices | process |
vars <- docvars(Awards_corp)
vars_period <- vars %>%
as_tibble() %>%
mutate(period = (vars$Award.Year - 1982)) %>%
select(period)
vars_period <- arrange(vars_period, period)
###COMMENTING THE MODEL FITTING AS IT TAKES LONG TIME
# awards_tmod_keyatm_dyn <- keyATM(
# docs = keyATM_docs,
# no_keyword_topics = 2,
# keywords = Awards_key_list,
# model = "dynamic",
# model_settings = list(time_index = vars_period$period,
# num_states = 2), # for HMM
# options = list(seed = 123))
# saveRDS(awards_tmod_keyatm_dyn, file = "./model.rda")
model_old = readRDS("model.rda")
# plot_timetrend(model_old, time_index_label=vars$year, xlab="Year")
plot_timetrend(model_old, time_index_label=vars$Award.Year[1:model_old$N], xlab="Year")
kable(top_words(model_old_base,5), format = "html", table.attr = "style='width:30%;'") %>%
kableExtra::kable_styling()
| 1_Agriculture | 2_Technology | 3_Defence | 4_Health | Other_1 | Other_2 |
|---|---|---|---|---|---|
| energy [✓] | data | data | clinical | n | design |
| technology | software | sensor [✓] | patients | power | materials |
| project | project | design | treatment | applications | technology |
| water [✓] | research [✓] | technology | develop | laser | performance |
| process | information | develop | disease | optical | thermal |