Dataset

URL: https://clinicaltrials.gov/ct2/results?cntry=US&age_v=&gndr=&type=&rslt=&phase=4&phase=0&phase=1&phase=2&phase=3&Search=Apply

# Load the ClinicalTrials.gov search export (described in the original note as
# the top 1000 search results) into a data frame.
csv <- read.csv(
  file = "C:/Users/Anhuynh/Desktop/Data Science Project/Clinical_Trial_Bitotech/SearchResults.csv"
)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep only the columns used downstream, selected by position in the export.
dt <- csv[, c(2, 5, 6, 7, 8, 9, 10, 13, 14, 15, 16, 19, 20, 21, 22, 24)]

# Parse the month-day-year date columns into Date values.
date_cols <- c(
  "Start.Date", "Primary.Completion.Date", "Completion.Date",
  "First.Posted", "Last.Update.Posted"
)
dt[date_cols] <- lapply(dt[date_cols], mdy)

# Current date; not referenced again in the visible part of this script.
today <- today()

# Trial durations in days, measured from the start date. They are stored as
# plain numbers rather than difftime objects so ggplot2 can pick a continuous
# scale without the "Don't know how to automatically pick scale for object of
# type difftime" message.
dt <- dt %>%
  mutate(
    Primary_timediff = as.numeric(
      difftime(Primary.Completion.Date, Start.Date, units = "days")
    ),
    Completion_timediff = as.numeric(
      difftime(Completion.Date, Start.Date, units = "days")
    )
  )
library(ggplot2)

# Area plot of primary trial duration against the date each trial was first
# posted, coloured and filled by whether study results are available.
ggplot(data = dt, mapping = aes(x = First.Posted, y = Primary_timediff)) +
  geom_area(
    mapping = aes(color = Study.Results, fill = Study.Results),
    position = position_dodge(0.8),
    alpha = 0.5
  ) +
  scale_colour_manual(values = c("#00AFBB", "#E7B800")) +
  scale_fill_manual(values = c("#00AFBB", "#E7B800")) +
  labs(
    x = "First Posted Year",
    y = "Time Diff. of Start vs. Completion (Days)",
    subtitle = "clinicaltrials.gov - updated August 9, 2021"
  )
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

# Area plot of primary trial duration against the date of each trial's last
# posted update, coloured and filled by whether study results are available.
ggplot(data = dt, mapping = aes(x = Last.Update.Posted, y = Primary_timediff)) +
  geom_area(
    mapping = aes(color = Study.Results, fill = Study.Results),
    position = position_dodge(0.8),
    alpha = 0.5
  ) +
  scale_colour_manual(values = c("#00AFBB", "#E7B800")) +
  scale_fill_manual(values = c("#00AFBB", "#E7B800")) +
  labs(
    x = "Last Update Posted Year",
    y = "Time Diff. of Start vs. Completion (Days)",
    subtitle = "clinicaltrials.gov - updated August 9, 2021"
  )
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

# library
library(ggplot2)
library(viridis)
## Loading required package: viridisLite
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(dplyr)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:viridis':
## 
##     viridis_pal
library(stringr)
# Order trials by primary duration, shortest first.
sub <- dt[order(dt$Primary_timediff, decreasing = FALSE), ]

# Keep trials whose primary duration is at most 365 days. which() discards
# rows whose duration is NA; plain logical indexing would instead turn each of
# them into a row made entirely of NA values.
dt_pha <- sub[which(sub$Primary_timediff <= 365), ]

# Only the three columns the plot below maps, selected by name.
dt_pha <- dt_pha[, c("Status", "Phases", "Primary_timediff")]

# Data Viz
# Stacked bars of primary duration per recruitment status, filled by phase.
# The dashed red line marks a duration of zero days.
ggplot(dt_pha, aes(fill = Phases, y = Primary_timediff, x = Status)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(discrete = TRUE) +
  ggtitle("Status of clinical trials within 01 year") +
  theme_ipsum() +
  labs(
    x = "Status",
    y = "Time Diff. of Start vs. Completion (Days)",
    subtitle = "clinicaltrials.gov - updated August 9, 2021"
  ) +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)
  ) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red")
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

library(ggplot2)

# Order trials by primary duration, shortest first, and keep the first 20.
sub <- dt[order(dt$Primary_timediff, decreasing = FALSE), ]
dt_sub <- head(sub, 20)

# Horizontal bars of primary duration per sponsor, filled by phase, with the
# duration printed inside each bar segment.
# NOTE(review): the title says "being completed soon", but the rows are the 20
# shortest start-to-primary-completion durations, not the trials closest to
# completion from today. Confirm which was intended.
ggplot(
  dt_sub,
  aes(
    x = Primary_timediff,
    y = Sponsor.Collaborators,
    fill = Phases,
    label = Primary_timediff
  )
) +
  geom_col() +
  scale_fill_viridis(discrete = TRUE) +
  geom_text(
    position = position_stack(vjust = 0.5),
    size = 3,
    color = "#ffffff"
  ) +
  labs(
    x = "Time Diff (days)",
    title = "Top 20 clinical trials being completed soon",
    subtitle = "clinicaltrials.gov - updated August 9, 2021"
  ) +
  theme(axis.text = element_text(size = 4.75))
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

library("tm")
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library("SnowballC")
library("wordcloud")
## Loading required package: RColorBrewer
library("RColorBrewer")
library("syuzhet")
## 
## Attaching package: 'syuzhet'
## The following object is masked from 'package:scales':
## 
##     rescale
library("ggplot2")
library(reshape2)
library(tidytext)
# Build a corpus with one document per entry of the outcome-measures column.
TextDoc <- Corpus(VectorSource(dt$Outcome.Measures))

# Transformer that substitutes a single space for every match of `pattern`.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# Clean the corpus, one transformation per step.
# In the recorded run each tm_map() call on this corpus emitted the warning
# "transformation drops documents" (raised from tm_map.SimpleCorpus).
TextDoc <- TextDoc %>%
  tm_map(toSpace, "/") %>%                         # "/" -> space
  tm_map(toSpace, "@") %>%                         # "@" -> space
  tm_map(toSpace, "\\|") %>%                       # "|" -> space
  tm_map(content_transformer(tolower)) %>%         # lower-case everything
  tm_map(removeNumbers) %>%                        # strip digits
  tm_map(removeWords, stopwords("english")) %>%    # common English stopwords
  tm_map(removeWords, c("s", "company", "team")) %>% # custom stopwords
  tm_map(removePunctuation) %>%                    # strip punctuation
  tm_map(stripWhitespace) %>%                      # collapse extra whitespace
  tm_map(stemDocument)                             # reduce words to their stems

# Term-document matrix and its dense matrix form.
TextDoc_dtm <- TermDocumentMatrix(TextDoc)
dtm_m <- as.matrix(TextDoc_dtm)

# Total frequency of each term across all documents, in decreasing order.
dtm_v <- sort(rowSums(dtm_m), decreasing = TRUE)
dtm_df <- data.frame(word = names(dtm_v), freq = dtm_v)

# Bar plot of the five most frequent terms.
barplot(
  height = dtm_df$freq[1:5],
  names.arg = dtm_df$word[1:5],
  las = 2,
  col = "lightgreen",
  main = "Top 5 most frequent words in Outcome measures ",
  ylab = "Word frequencies"
)

# Word cloud of the term frequencies; the seed is fixed before the call so the
# layout is reproducible.
set.seed(1234)
wordcloud(
  words = dtm_df$word,
  freq = dtm_df$freq,
  min.freq = 5,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.40,
  colors = brewer.pal(8, "Dark2")
)

Emotion Classification

# Score every outcome-measures entry against the NRC lexicon; the result has
# one row per entry and one column per emotion or polarity.
d <- get_nrc_sentiment(dt$Outcome.Measures)

# Show the first 10 rows of the sentiment data frame.
head(d, n = 10)
##    anger anticipation disgust fear joy sadness surprise trust negative positive
## 1      2            3       2    2   1       3        0     2        2        2
## 2      3            1       3    3   1       4        0     1        3        1
## 3      2            4       2    3   1       3        1     1        4        4
## 4      0            1       0    0   0       2        0     0        3        1
## 5      0            0       0    0   0       0        0     1        0        0
## 6      0            0       0    1   0       0        0     0        1        1
## 7      1            1       1    1   1       1        0     3        2        5
## 8      1            1       1    2   1       2        0     1        1        2
## 9      1            1       1    2   0       1        0     0        2        1
## 10     1            1       1    1   1       2        0     3        1        3
# Horizontal bar plot comparing how much of the emotion-word total each of the
# eight emotions (columns 1 to 8 of `d`) accounts for.
# Each share is multiplied by 100 so the plotted values are true percentages,
# matching the "Percentage" axis label; prop.table() alone yields fractions
# between 0 and 1.
barplot(
  sort(100 * colSums(prop.table(as.matrix(d[, 1:8])))),
  horiz = TRUE,
  cex.names = 0.7,
  las = 1,
  main = "Emotions in description of Outcome measures",
  xlab = "Percentage"
)