This interactive Shiny application enables an in-depth exploration of drug effectiveness and side effects using the UCI Drug Review dataset. It provides a range of analytical features, such as sentiment analysis of user reviews, visualization of the distribution of drug ratings, and a comparison of side effects via word clouds. Users can filter the data by condition and drug to gain insight into each drug’s effectiveness, its common side effects, and statistical trends across conditions. The application aims to support healthcare professionals and researchers in evaluating drug performance based on real-world user feedback.
Sentiment Analysis: Used two sentiment lexicons (Bing and NRC) to categorize the words in drug reviews by sentiment, allowing for a detailed understanding of user experiences.
Word Cloud Visualizations: Created dynamic word clouds representing common side effects, helping users quickly identify the most frequent concerns or benefits.
Statistical Analysis: Applied ANOVA to examine differences in drug effectiveness based on various conditions, providing statistical backing to user reviews.
Interactive Dashboard: Developed a Shiny-based interactive interface that allows users to select specific drugs, conditions, and statistics to explore tailored data insights.
Comprehensive Drug Comparison: Enabled users to view top-rated drugs per condition and compare their effectiveness across different parameters.
This project offers a robust, interactive tool for analyzing drug effectiveness and side effects. By integrating sentiment analysis, statistical techniques, and user-friendly visualizations, it provides users with actionable insights to make informed decisions about drug usage and safety. This dashboard serves as an invaluable resource for healthcare professionals, researchers, and consumers alike, providing a deeper understanding of drug performance and side effect profiles.
# Install any required packages that are not yet available.
# "textdata" provides the NRC lexicon requested via get_sentiments("nrc"),
# and "rsconnect" is used by the deployment call at the end of the script.
required_packages <- c(
  "dplyr", "ggplot2", "tidyr", "tidytext", "wordcloud", "RColorBrewer",
  "recommenderlab", "shiny", "tm", "slam", "textdata", "rsconnect"
)
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
# Load Required Libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(tidytext)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(recommenderlab)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loading required package: arules
##
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
##
## recode
## The following objects are masked from 'package:base':
##
## abbreviate, write
## Loading required package: proxy
##
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
##
## as.matrix
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## Registered S3 methods overwritten by 'registry':
## method from
## print.registry_field proxy
## print.registry_entry proxy
library(shiny)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
##
## Attaching package: 'tm'
## The following object is masked from 'package:arules':
##
## inspect
library(slam)
# Step 1: Load the UCI Dataset
# Switch to the author's project folder only when it exists, so the script
# still runs from the current working directory on other machines instead of
# stopping with a setwd() error.
project_dir <- "C:/Users/OMPRAKASH/R Studio/Drug Effectiveness"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
# Tab-separated file with a header row
drug_data <- read.delim("drugLibTrain_raw.tsv", sep = "\t", header = TRUE)
# Data Preprocessing
# Inspect column types and the rating distribution before cleaning
summary(drug_data)
## X urlDrugName rating effectiveness
## Min. : 0 Length:3107 Min. : 1.000 Length:3107
## 1st Qu.:1062 Class :character 1st Qu.: 5.000 Class :character
## Median :2092 Mode :character Median : 8.000 Mode :character
## Mean :2081 Mean : 7.006
## 3rd Qu.:3092 3rd Qu.: 9.000
## Max. :4161 Max. :10.000
## sideEffects condition benefitsReview sideEffectsReview
## Length:3107 Length:3107 Length:3107 Length:3107
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## commentsReview
## Length:3107
## Class :character
## Mode :character
##
##
##
# Drop every row that has an NA in any column
drug_data <- drug_data %>% drop_na()
# Combine review columns into a single 'allReviews' column
drug_data$allReviews <- paste(
  drug_data$benefitsReview,
  drug_data$sideEffectsReview,
  drug_data$commentsReview,
  sep = " "
)
# Normalize the combined reviews: lower-case, then strip punctuation
drug_data$allReviews <- tolower(drug_data$allReviews)
drug_data$allReviews <- gsub("[[:punct:]]", "", drug_data$allReviews)
# Custom stop-word list. It is only defined here; the words are removed later,
# when the tokenised reviews are filtered for the word clouds.
# "effects" is added because the list contained "side" + "affects", which left
# "effects" (as in "side effects") unfiltered. "affects" is kept so that words
# removed before are still removed.
stopwords_custom <- c(
  stopwords("en"),
  "drug", "use", "effectiveness", "side", "effects", "affects",
  "treatment", "patients"
)
drug_data$allReviews <- gsub("http\\S+|www\\S+", "", drug_data$allReviews) # Remove URLs
drug_data$allReviews <- gsub("\\s+", " ", drug_data$allReviews) # Remove extra spaces
# Step 2: Sentiment Analysis with Multiple Lexicons
# Tokenise the combined reviews, keep the words found in the Bing lexicon and
# count how many tokens fall under each sentiment label.
# The join key and relationship are spelled out (as in the NRC join) so the
# join no longer relies on an implicit `by` and prints no "Joining with"
# message.
sentiments <- drug_data %>%
  unnest_tokens(word, allReviews) %>%
  inner_join(
    get_sentiments("bing"),
    by = "word",
    relationship = "many-to-many"
  ) %>%
  count(sentiment, sort = TRUE)
# Adding another sentiment lexicon (e.g., NRC)
# Same steps as for Bing: tokenise, match against the lexicon, then tally the
# matched tokens per sentiment label (largest count first).
sentiments_nrc <- count(
  inner_join(
    unnest_tokens(drug_data, word, allReviews),
    get_sentiments("nrc"),
    by = "word",
    relationship = "many-to-many"
  ),
  sentiment,
  sort = TRUE
)
# Combine both sentiment analyses
# Both inputs already hold per-sentiment totals in `n`, so the totals must be
# summed with `wt = n`. A plain count(sentiment) would only count the stacked
# rows (1 or 2 per sentiment) and the plot would show those row counts.
sentiments_combined <- bind_rows(sentiments, sentiments_nrc) %>%
  count(sentiment, wt = n, sort = TRUE)
# Visualize Sentiment Analysis
# One bar per sentiment label; bar height is the value stored in `n`
sentiments_combined %>%
  ggplot(mapping = aes(x = sentiment, y = n, fill = sentiment)) +
  geom_col() +
  ggtitle("Sentiment Analysis of Reviews") +
  xlab("Sentiment") +
  ylab("Count")
# Step 3: Word Cloud with Custom Stop Words and Warning Fix
# Split the combined reviews into single words and discard the custom stop
# words.
word_data <- filter(
  unnest_tokens(drug_data, word, allReviews),
  !(word %in% stopwords_custom)
)
# Frequency of each remaining word, most frequent first
word_freq <- count(word_data, word, sort = TRUE)
# Draw the cloud: words seen at least 10 times, at most 200 of them, placed in
# decreasing order of frequency.
with(
  word_freq,
  wordcloud(
    words = word,
    freq = n,
    scale = c(3, 0.3),
    min.freq = 10,
    max.words = 200,
    random.order = FALSE,
    colors = brewer.pal(8, "Dark2")
  )
)
# Step 4: Statistical Analysis (ANOVA)
# One-way ANOVA of rating on condition.
# NOTE(review): `condition` is free text with 1,426 distinct values in this
# data (see the summary table), many apparently with a single review, so the
# F-test should be interpreted with caution.
anova_result <- aov(formula = rating ~ condition, data = drug_data)
summary(object = anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## condition 1425 13666 9.590 1.227 2.78e-05 ***
## Residuals 1680 13127 7.814
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Step 5: EDA - Distribution of Ratings by Condition
# One histogram panel per condition, each panel with its own y-axis scale.
# NOTE(review): `condition` has 1,426 distinct values in this data, so the
# facet grid and legend are very large; consider restricting the plot to the
# most-reviewed conditions.
drug_data %>%
  ggplot(mapping = aes(x = rating)) +
  geom_histogram(mapping = aes(fill = condition), binwidth = 1, alpha = 0.7) +
  facet_wrap(vars(condition), scales = "free_y") +
  ggtitle("Distribution of Ratings Across Conditions") +
  xlab("Rating") +
  ylab("Count") +
  theme_minimal()
# Step 6: Boxplot of Ratings by Condition
# One box per condition; x-axis labels are rotated 45 degrees for legibility
drug_data %>%
  ggplot(mapping = aes(x = condition, y = rating)) +
  geom_boxplot(alpha = 0.7, fill = "lightblue") +
  ggtitle("Boxplot of Ratings by Condition") +
  xlab("Condition") +
  ylab("Rating") +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))
# Step 7: Create Summary Table for Ratings by Condition
# Per condition: mean rating, median rating and number of reviews
rating_summary <- summarise(
  group_by(drug_data, condition),
  average_rating = mean(rating, na.rm = TRUE),
  median_rating = median(rating, na.rm = TRUE),
  rating_count = n()
)
# Print the summary table
print(x = rating_summary)
## # A tibble: 1,426 × 4
## condition average_rating median_rating rating_count
## <chr> <dbl> <dbl> <int>
## 1 2 compressed discs in neck 9 9 1
## 2 20 year pack a day smoker 10 10 1
## 3 a boil 8 8 1
## 4 a little bit of osteoporosis in th… 8 8 1
## 5 a typical migraines 3 3 1
## 6 abcessed tooth 1 1 1
## 7 abdominal pain 9 9 1
## 8 ac joint dislocation 10 10 1
## 9 achilles tendonitis 9 9 1
## 10 acic reflux 1 1 1
## # ℹ 1,416 more rows
# Step 8: Top Drugs by Condition
# Mean rating for every (condition, drug) pair, highest average first.
# `.groups = "drop"` returns an ungrouped table, so later filters and plots do
# not inherit a leftover grouping by condition, and summarise() no longer
# prints its "grouped output" message.
top_drugs <- drug_data %>%
  group_by(condition, urlDrugName) %>%
  summarise(avg_rating = mean(rating, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(avg_rating))
# Step 9: Side Effect Frequency (Word Cloud for Side Effects)
# Tokenise only the side-effects review text and drop the custom stop words
side_effects_data <- filter(
  unnest_tokens(drug_data, word, sideEffectsReview),
  !(word %in% stopwords_custom)
)
# Word frequencies, most frequent first
side_effect_freq <- count(side_effects_data, word, sort = TRUE)
# Cloud of words seen at least 10 times (at most 200 words), placed in
# decreasing order of frequency
wordcloud(
  side_effect_freq[["word"]],
  side_effect_freq[["n"]],
  scale = c(3, 0.3),
  min.freq = 10,
  max.words = 200,
  random.order = FALSE,
  colors = brewer.pal(n = 8, name = "Dark2")
)
# User interface: a sidebar with three selectors (condition, drugs, statistic)
# and a main panel holding one plot and two tables.
# Straight quotes and `$` column access are restored; the typographic quotes
# and `\(` / `\)` sequences in the rendered text are not valid R.
ui <- fluidPage(
  titlePanel("Drug Effectiveness Analysis"),
  sidebarLayout(
    sidebarPanel(
      selectInput(
        "condition",
        "Select Condition",
        choices = unique(drug_data$condition)
      ),
      # Several drugs can be selected at once
      selectInput(
        "drugs",
        "Select Drugs",
        choices = unique(drug_data$urlDrugName),
        multiple = TRUE
      ),
      selectInput(
        "statistic",
        "Select Statistic",
        choices = c(
          "Effectiveness Distribution",
          "Boxplot of Ratings",
          "Average Ratings Summary",
          "Top Drugs"
        )
      )
    ),
    mainPanel(
      plotOutput("statisticPlot"),
      tableOutput("ratingSummaryTable"),
      tableOutput("drugComparisonTable")
    )
  )
)
# Server logic: draws the plot for the selected statistic and fills the two
# tables shown below it.
#
# input   - reads `condition`, `drugs` and `statistic` from the UI selectors.
# output  - writes `statisticPlot`, `ratingSummaryTable`, `drugComparisonTable`.
# session - Shiny session object (not used here).
server <- function(input, output, session) {
  # Plot output based on the selected statistic
  output$statisticPlot <- renderPlot({
    req(input$condition, input$statistic) # Ensure both inputs are available

    # The drug selector's input id is "drugs"; reading `input$urlDrugName`
    # always gave NULL and therefore an empty selection.
    selected_drugs <- drug_data %>%
      filter(urlDrugName %in% input$drugs, condition == input$condition)

    # The first two statistics are drawn from the selected drugs only, so show
    # a message instead of an empty plot when nothing matches.
    needs_drugs <- input$statistic %in%
      c("Effectiveness Distribution", "Boxplot of Ratings")
    if (needs_drugs) {
      validate(need(
        nrow(selected_drugs) > 0,
        "Select at least one drug that has reviews for this condition."
      ))
    }

    if (input$statistic == "Effectiveness Distribution") {
      ggplot(selected_drugs, aes(x = rating, fill = urlDrugName)) +
        geom_histogram(binwidth = 1, alpha = 0.7) +
        labs(
          title = "Effectiveness Ratings for Selected Drugs",
          x = "Effectiveness",
          y = "Count"
        )
    } else if (input$statistic == "Boxplot of Ratings") {
      ggplot(selected_drugs, aes(x = urlDrugName, y = rating)) +
        geom_boxplot(fill = "lightblue", alpha = 0.7) +
        labs(
          title = "Boxplot of Ratings for Selected Drugs",
          x = "Drug",
          y = "Rating"
        ) +
        theme(axis.text.x = element_text(angle = 45, hjust = 1))
    } else if (input$statistic == "Average Ratings Summary") {
      condition_summary <- rating_summary %>%
        filter(condition == input$condition)
      ggplot(condition_summary, aes(x = condition, y = average_rating)) +
        geom_bar(stat = "identity", fill = "lightgreen") +
        labs(
          title = paste("Average Rating for", input$condition),
          x = "Condition",
          y = "Average Rating"
        )
    } else if (input$statistic == "Top Drugs") {
      top_drugs_filtered <- top_drugs %>%
        filter(condition == input$condition)
      # Bars ordered by average rating
      ggplot(
        top_drugs_filtered,
        aes(x = reorder(urlDrugName, avg_rating), y = avg_rating)
      ) +
        geom_bar(stat = "identity", fill = "lightblue") +
        labs(
          title = paste("Top Drugs by Rating for", input$condition),
          x = "Drug",
          y = "Average Rating"
        ) +
        theme(axis.text.x = element_text(angle = 45, hjust = 1))
    }
  })

  # Render Rating Summary Table (the full per-condition summary)
  output$ratingSummaryTable <- renderTable({
    rating_summary
  })

  # Render Drug Comparison Table: average rating of each selected drug for
  # the selected condition
  output$drugComparisonTable <- renderTable({
    req(input$condition)
    top_drugs %>%
      filter(condition == input$condition & urlDrugName %in% input$drugs)
  })
}
# Build and launch the Shiny app from the UI and server defined above
shinyApp(ui = ui, server = server)
# Publish the current directory with rsconnect (straight quotes restored; the
# typographic quotes in the rendered text are a syntax error).
# NOTE(review): if this script itself is deployed as the app file, this call
# should presumably be run from the console instead, so that shinyApp() is the
# last expression in the file - confirm how the app is deployed.
rsconnect::deployApp(".")