Description

This interactive Shiny application enables an in-depth exploration of drug effectiveness and side effects using the UCI Drug Review dataset. It provides a range of analytical features, such as sentiment analysis of user reviews, visualization of the distribution of drug ratings, and a comparison of side effects via word clouds. Users can filter the data by condition and by drug to gain insight into each drug’s effectiveness, its common side effects, and statistical trends across conditions. The application aims to support healthcare professionals and researchers in evaluating drug performance based on real-world user feedback.

Key Achievements

Sentiment Analysis: Utilized multiple sentiment lexicons to categorize drug reviews into positive and negative sentiments, allowing for a detailed understanding of user experiences.
Word Cloud Visualizations: Created dynamic word clouds representing common side effects, helping users quickly identify the most frequent concerns or benefits.
Statistical Analysis: Applied ANOVA to examine differences in drug effectiveness based on various conditions, providing statistical backing to user reviews.
Interactive Dashboard: Developed a Shiny-based interactive interface that allows users to select specific drugs, conditions, and statistics to explore tailored data insights.
Comprehensive Drug Comparison: Enabled users to view top-rated drugs per condition and compare their effectiveness across different parameters.

Conclusion

This project offers a robust, interactive tool for analyzing drug effectiveness and side effects. By integrating sentiment analysis, statistical techniques, and user-friendly visualizations, it provides users with actionable insights to make informed decisions about drug usage and safety. This dashboard serves as an invaluable resource for healthcare professionals, researchers, and consumers alike, providing a deeper understanding of drug performance and side effect profiles.

Install the required libraries

# Install only the packages that are not already present, so re-running this
# script does not reinstall everything each time.
required_packages <- c(
  "dplyr", "ggplot2", "tidyr", "tidytext", "wordcloud", "RColorBrewer",
  "recommenderlab", "shiny", "tm", "slam",
  "textdata"  # tidytext::get_sentiments("nrc") fetches its lexicon via textdata
)
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

# Load Required Libraries

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(tidyr)
library(tidytext)
library(wordcloud)

## Loading required package: RColorBrewer

library(RColorBrewer)
library(recommenderlab)

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## Loading required package: arules

## 
## Attaching package: 'arules'

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following objects are masked from 'package:base':
## 
##     abbreviate, write

## Loading required package: proxy

## 
## Attaching package: 'proxy'

## The following object is masked from 'package:Matrix':
## 
##     as.matrix

## The following objects are masked from 'package:stats':
## 
##     as.dist, dist

## The following object is masked from 'package:base':
## 
##     as.matrix

## Registered S3 methods overwritten by 'registry':
##   method               from 
##   print.registry_field proxy
##   print.registry_entry proxy

library(shiny)
library(tm)

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

## 
## Attaching package: 'tm'

## The following object is masked from 'package:arules':
## 
##     inspect

library(slam)

# Step 1: Load the UCI Dataset

# Locate the raw training file without changing the working directory.
# The folder defaults to the current directory and can be overridden with the
# DRUG_DATA_DIR environment variable, so the script does not depend on one
# machine's folder layout.
data_dir <- Sys.getenv("DRUG_DATA_DIR", unset = ".")
data_path <- file.path(data_dir, "drugLibTrain_raw.tsv")
if (!file.exists(data_path)) {
  stop("Data file not found: ", data_path, call. = FALSE)
}

# Tab-separated file with a header row
drug_data <- read.delim(data_path, sep = "\t", header = TRUE)


# Data Preprocessing

# Per-column overview of the freshly loaded data
summary(drug_data)

##        X        urlDrugName            rating       effectiveness     
##  Min.   :   0   Length:3107        Min.   : 1.000   Length:3107       
##  1st Qu.:1062   Class :character   1st Qu.: 5.000   Class :character  
##  Median :2092   Mode  :character   Median : 8.000   Mode  :character  
##  Mean   :2081                      Mean   : 7.006                     
##  3rd Qu.:3092                      3rd Qu.: 9.000                     
##  Max.   :4161                      Max.   :10.000                     
##  sideEffects         condition         benefitsReview     sideEffectsReview 
##  Length:3107        Length:3107        Length:3107        Length:3107       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  commentsReview    
##  Length:3107       
##  Class :character  
##  Mode  :character  
##                    
##                    
##

# Drop every row that has a missing value in any column
drug_data <- drug_data %>% drop_na()

# Combine review columns into a single 'allReviews' column

drug_data$allReviews <- paste(
  drug_data$benefitsReview,
  drug_data$sideEffectsReview,
  drug_data$commentsReview,
  sep = " "
)


# Normalize the combined reviews

drug_data$allReviews <- tolower(drug_data$allReviews)

# Remove URLs first, while their punctuation is still intact and the pattern
# can match the whole address
drug_data$allReviews <- gsub("http\\S+|www\\S+", "", drug_data$allReviews)

# Drop apostrophes so contractions stay a single token ("don't" -> "dont"),
# then turn all remaining punctuation into a space. Deleting punctuation
# outright would fuse neighbouring words ("nausea/vomiting" -> "nauseavomiting").
drug_data$allReviews <- gsub("'", "", drug_data$allReviews, fixed = TRUE)
drug_data$allReviews <- gsub("[[:punct:]]", " ", drug_data$allReviews)

# Collapse runs of whitespace and trim the ends
drug_data$allReviews <- trimws(gsub("\\s+", " ", drug_data$allReviews))


# Custom stop-word list. It is only defined here; the filtering itself happens
# where the word clouds are built.
# "effects" is added next to the original "affects", which looks like a typo
# for it ("side", "effects"); "affects" is kept for backward compatibility.
stopwords_custom <- c(
  stopwords("en"),
  "drug", "use", "effectiveness", "side", "affects", "effects",
  "treatment", "patients"
)

# Step 2: Sentiment Analysis with Multiple Lexicons

# Tokenize the combined reviews once; both lexicon joins reuse the result
review_words <- drug_data %>%
  unnest_tokens(word, allReviews)

# Bing lexicon: positive / negative.
# The join key is explicit, and many-to-many is declared because a lexicon
# can list the same word under more than one sentiment.
sentiments <- review_words %>%
  inner_join(
    get_sentiments("bing"),
    by = "word",
    relationship = "many-to-many"
  ) %>%
  count(sentiment, sort = TRUE)

# Adding another sentiment lexicon (e.g., NRC)

sentiments_nrc <- review_words %>%
  inner_join(
    get_sentiments("nrc"),
    by = "word",
    relationship = "many-to-many"
  ) %>%
  count(sentiment, sort = TRUE)


# Combine both sentiment analyses

# Both inputs are already per-sentiment totals in column `n`, so the totals
# must be summed (wt = n). A plain count() would only count rows, giving
# 1 or 2 for every sentiment.
sentiments_combined <- bind_rows(sentiments, sentiments_nrc) %>%
  count(sentiment, wt = n, sort = TRUE)

# Visualize Sentiment Analysis

ggplot(sentiments_combined, aes(x = sentiment, y = n, fill = sentiment)) +
  geom_col() +
  labs(title = "Sentiment Analysis of Reviews", x = "Sentiment", y = "Count")

# Step 3: Word Cloud with Custom Stop Words and Warning Fix

# One row per word of the combined reviews, minus anything on the custom
# stop-word list
word_data <- filter(
  unnest_tokens(drug_data, word, allReviews),
  !(word %in% stopwords_custom)
)

# Word frequencies, most frequent first
word_freq <- count(word_data, word, sort = TRUE)

# Draw the cloud: words seen at least 10 times, at most 200 of them, placed
# from most to least frequent
with(
  word_freq,
  wordcloud(
    words = word,
    freq = n,
    scale = c(3, 0.3),
    min.freq = 10,
    max.words = 200,
    random.order = FALSE,
    colors = brewer.pal(8, "Dark2")
  )
)

# Step 4: Statistical Analysis (ANOVA)

# One-way ANOVA of rating with condition as the single factor
anova_result <- aov(
  formula = rating ~ condition,
  data = drug_data
)
summary(anova_result)

##               Df Sum Sq Mean Sq F value   Pr(>F)    
## condition   1425  13666   9.590   1.227 2.78e-05 ***
## Residuals   1680  13127   7.814                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Step 5: EDA - Distribution of Ratings by Condition

# `condition` is free text with well over a thousand distinct values (the
# ANOVA above reports 1425 degrees of freedom for it). One facet or one box
# per value is unreadable, so both plots are limited to the most frequently
# reviewed conditions.
top_n_conditions <- 12
frequent_conditions <- drug_data %>%
  count(condition, sort = TRUE) %>%
  slice_head(n = top_n_conditions) %>%
  pull(condition)

frequent_condition_data <- drug_data %>%
  filter(condition %in% frequent_conditions)

conditions_subtitle <- paste(
  "Top", top_n_conditions, "conditions by number of reviews"
)

# The legend is hidden because each facet strip already names its condition
ggplot(frequent_condition_data, aes(x = rating, fill = condition)) +
  geom_histogram(binwidth = 1, alpha = 0.7, show.legend = FALSE) +
  facet_wrap(~ condition, scales = "free_y") +
  labs(
    title = "Distribution of Ratings Across Conditions",
    subtitle = conditions_subtitle,
    x = "Rating",
    y = "Count"
  ) +
  theme_minimal()

# Step 6: Boxplot of Ratings by Condition

ggplot(frequent_condition_data, aes(x = condition, y = rating)) +
  geom_boxplot(fill = "lightblue", alpha = 0.7) +
  labs(
    title = "Boxplot of Ratings by Condition",
    subtitle = conditions_subtitle,
    x = "Condition",
    y = "Rating"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Step 7: Create Summary Table for Ratings by Condition

# Per-condition rating statistics: mean, median and number of reviews
rating_summary <- summarise(
  group_by(drug_data, condition),
  average_rating = mean(rating, na.rm = TRUE),
  median_rating = median(rating, na.rm = TRUE),
  rating_count = n()
)

# Show the resulting table

print(rating_summary)

## # A tibble: 1,426 × 4
##    condition                           average_rating median_rating rating_count
##    <chr>                                        <dbl>         <dbl>        <int>
##  1 2 compressed discs in neck                       9             9            1
##  2 20 year pack a day smoker                       10            10            1
##  3 a boil                                           8             8            1
##  4 a little bit of osteoporosis in th…              8             8            1
##  5 a typical migraines                              3             3            1
##  6 abcessed tooth                                   1             1            1
##  7 abdominal pain                                   9             9            1
##  8 ac joint dislocation                            10            10            1
##  9 achilles tendonitis                              9             9            1
## 10 acic reflux                                      1             1            1
## # ℹ 1,416 more rows

# Step 8: Top Drugs by Condition

# Mean rating for every (condition, drug) pair, highest first.
# `.groups = "drop"` returns an ungrouped table, so the leftover grouping by
# condition does not leak into later steps and summarise() no longer prints
# its regrouping message.
top_drugs <- drug_data %>%
  group_by(condition, urlDrugName) %>%
  summarise(avg_rating = mean(rating, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(avg_rating))

# Step 9: Side Effect Frequency (Word Cloud for Side Effects)

# Split the side-effect reviews into single words and discard stop words
side_effects_data <- filter(
  unnest_tokens(drug_data, word, sideEffectsReview),
  !(word %in% stopwords_custom)
)

# How often each remaining word occurs, most frequent first
side_effect_freq <- count(side_effects_data, word, sort = TRUE)

# Cloud of words that occur at least 10 times, capped at 200 words
wordcloud(
  side_effect_freq$word,
  side_effect_freq$n,
  scale = c(3, 0.3),
  min.freq = 10,
  max.words = 200,
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)

Shiny App Code

Step 10: Create an Interactive Dashboard (Shiny)

# User interface: a sidebar with three selectors and a main panel holding one
# plot and two tables.
# Input ids:  condition, drugs (multiple selection), statistic
# Output ids: statisticPlot, ratingSummaryTable, drugComparisonTable
ui <- fluidPage(
  titlePanel("Drug Effectiveness Analysis"),
  sidebarLayout(
    sidebarPanel(
      selectInput(
        "condition",
        "Select Condition",
        choices = unique(drug_data$condition)
      ),
      selectInput(
        "drugs",
        "Select Drugs",
        choices = unique(drug_data$urlDrugName),
        multiple = TRUE
      ),
      selectInput(
        "statistic",
        "Select Statistic",
        choices = c(
          "Effectiveness Distribution",
          "Boxplot of Ratings",
          "Average Ratings Summary",
          "Top Drugs"
        )
      )
    ),
    mainPanel(
      plotOutput("statisticPlot"),
      tableOutput("ratingSummaryTable"),
      tableOutput("drugComparisonTable")
    )
  )
)

# Server logic for the dashboard.
#
# Reads the inputs `condition`, `drugs` and `statistic` and fills the outputs
# `statisticPlot`, `ratingSummaryTable` and `drugComparisonTable`.
# The drug selector's input id is "drugs", so it is read as `input$drugs`.
server <- function(input, output, session) {
  # Reviews for the chosen condition. When one or more drugs are selected the
  # reviews are narrowed to those drugs; with no drug selected, every drug
  # reviewed for the condition is kept so the plots are not blank on start-up.
  selected_reviews <- reactive({
    req(input$condition)
    reviews <- drug_data %>%
      filter(condition == input$condition)
    if (length(input$drugs) > 0) {
      reviews <- reviews %>%
        filter(urlDrugName %in% input$drugs)
    }
    reviews
  })

  # Plot Output based on selection
  output$statisticPlot <- renderPlot({
    req(input$condition, input$statistic)  # Ensure both inputs are available

    if (input$statistic == "Effectiveness Distribution") {
      reviews <- selected_reviews()
      # Selected drugs may have no reviews for this condition
      validate(need(
        nrow(reviews) > 0,
        "No reviews match the selected condition and drugs."
      ))
      ggplot(reviews, aes(x = rating, fill = urlDrugName)) +
        geom_histogram(binwidth = 1, alpha = 0.7) +
        labs(
          title = "Effectiveness Ratings for Selected Drugs",
          x = "Effectiveness",
          y = "Count"
        )
    } else if (input$statistic == "Boxplot of Ratings") {
      reviews <- selected_reviews()
      validate(need(
        nrow(reviews) > 0,
        "No reviews match the selected condition and drugs."
      ))
      ggplot(reviews, aes(x = urlDrugName, y = rating)) +
        geom_boxplot(fill = "lightblue", alpha = 0.7) +
        labs(
          title = "Boxplot of Ratings for Selected Drugs",
          x = "Drug",
          y = "Rating"
        ) +
        theme(axis.text.x = element_text(angle = 45, hjust = 1))
    } else if (input$statistic == "Average Ratings Summary") {
      condition_summary <- rating_summary %>%
        filter(condition == input$condition)
      ggplot(condition_summary, aes(x = condition, y = average_rating)) +
        geom_col(fill = "lightgreen") +
        labs(
          title = paste("Average Rating for", input$condition),
          x = "Condition",
          y = "Average Rating"
        )
    } else if (input$statistic == "Top Drugs") {
      top_drugs_filtered <- top_drugs %>%
        filter(condition == input$condition)
      ggplot(
        top_drugs_filtered,
        aes(x = reorder(urlDrugName, avg_rating), y = avg_rating)
      ) +
        geom_col(fill = "lightblue") +
        labs(
          title = paste("Top Drugs by Rating for", input$condition),
          x = "Drug",
          y = "Average Rating"
        ) +
        theme(axis.text.x = element_text(angle = 45, hjust = 1))
    }
  })

  # Render Rating Summary Table
  # NOTE(review): this shows the summary for every condition, not only the
  # selected one - confirm that is intended.
  output$ratingSummaryTable <- renderTable({
    rating_summary
  })

  # Render Drug Comparison Table: average rating per drug for the chosen
  # condition, narrowed to the selected drugs when any are selected
  output$drugComparisonTable <- renderTable({
    req(input$condition)
    selected_drugs_summary <- top_drugs %>%
      filter(condition == input$condition)
    if (length(input$drugs) > 0) {
      selected_drugs_summary <- selected_drugs_summary %>%
        filter(urlDrugName %in% input$drugs)
    }
    selected_drugs_summary
  })
}

# Publishing is a manual, one-off step. Run it from the R console instead of
# leaving it in the app script, where it would be executed again on every
# launch (including on the hosting server):
#   rsconnect::deployApp(".")

# Build and launch the app. Kept as the final expression so that a script used
# as app.R evaluates to the app object.
shinyApp(ui = ui, server = server)