Prescription Prediction

# Installing the required packages
library(tidyverse)

## Warning: package 'tidyr' was built under R version 4.3.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidyr)
library(dplyr)
library(ggplot2)
library(jsonlite)

## 
## Attaching package: 'jsonlite'
## 
## The following object is masked from 'package:purrr':
## 
##     flatten

install.packages("readr", repos = "http://cran.us.r-project.org")

## 
## The downloaded binary packages are in
##  /var/folders/5m/4f5rvwrn5rngf6j4gpl2mc9w0000gn/T//RtmpYvR2Aq/downloaded_packages

library(readr)
install.packages("tidytext" ,repos = "http://cran.us.r-project.org")

## 
## The downloaded binary packages are in
##  /var/folders/5m/4f5rvwrn5rngf6j4gpl2mc9w0000gn/T//RtmpYvR2Aq/downloaded_packages

library("tidytext")

## Warning: package 'tidytext' was built under R version 4.3.2

install.packages("stringr",repos = "http://cran.us.r-project.org")

## 
## The downloaded binary packages are in
##  /var/folders/5m/4f5rvwrn5rngf6j4gpl2mc9w0000gn/T//RtmpYvR2Aq/downloaded_packages

library(stringr)
install.packages("forecast",repos = "http://cran.us.r-project.org")

## 
## The downloaded binary packages are in
##  /var/folders/5m/4f5rvwrn5rngf6j4gpl2mc9w0000gn/T//RtmpYvR2Aq/downloaded_packages

library("forecast")

## Warning: package 'forecast' was built under R version 4.3.2

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

install.packages("zoo" ,repos = "http://cran.us.r-project.org")

## 
## The downloaded binary packages are in
##  /var/folders/5m/4f5rvwrn5rngf6j4gpl2mc9w0000gn/T//RtmpYvR2Aq/downloaded_packages

library("zoo")

## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

# Getting the working directory 
getwd()

## [1] "/Users/ursulapodosenin/Desktop"

# Reading the contents of the tsv file using read_tsv() function and getting a look at the data
drug_reviews<-readr::read_tsv("/Users/ursulapodosenin/Desktop/drugreviews.tsv")

## New names:
## Rows: 53766 Columns: 7
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (4): drugName, condition, review, date dbl (3): ...1, rating, usefulCount
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

glimpse(drug_reviews)

## Rows: 53,766
## Columns: 7
## $ ...1        <dbl> 163740, 206473, 159672, 39293, 97768, 208087, 215892, 1698…
## $ drugName    <chr> "Mirtazapine", "Mesalamine", "Bactrim", "Contrave", "Cycla…
## $ condition   <chr> "Depression", "Crohn's Disease, Maintenance", "Urinary Tra…
## $ review      <chr> "\"I&#039;ve tried a few antidepressants over the years (c…
## $ rating      <dbl> 10, 8, 9, 9, 9, 4, 6, 9, 7, 2, 1, 6, 1, 10, 2, 9, 10, 3, 6…
## $ date        <chr> "February 28, 2012", "May 17, 2009", "September 29, 2017",…
## $ usefulCount <dbl> 22, 17, 3, 35, 4, 13, 1, 32, 21, 3, 17, 7, 57, 19, 44, 14,…

# Reading the contents of the csv file using read.csv() function and getting a look at the data
prescription_refills<-as.data.frame(read.csv("/Users/ursulapodosenin/Desktop/prescriptiondata2010.csv"))

# Creating a new data frame with the columns I need 
state<-prescription_refills[,3]
prescription_refills<-prescription_refills[,14:16]
prescription_refills<-cbind(state, prescription_refills)
glimpse(prescription_refills)

## Rows: 52
## Columns: 4
## $ state                                                                                         <chr> …
## $ Percent.of.beneficiaries.filling.at.least.one.prescription.for.an.SSRI.SNRI..2010.            <dbl> …
## $ Percent.of.beneficiaries.filling.at.least.one.prescription.for.a.dementia.medication..2010.   <dbl> …
## $ Percent.of.beneficiaries.filling.at.least.one.prescription.for.a.new.sedative.hypnotic..2010. <dbl> …

# Renaming the columns of the data frame
colnames(prescription_refills) <- c("State", "Percent_SSRISNRI_Prescription", "Percent_Dementia_Prescription", "Percent_SedativeHypnotic_Prescription")

# Plotting prescription refills by state using ggplot2
ggplot(prescription_refills, aes(x = State)) +
  geom_bar(aes(y = Percent_SSRISNRI_Prescription, fill = "SSRI/SNRI"), stat = "identity", position = "dodge") +
  geom_bar(aes(y = Percent_Dementia_Prescription, fill = "Dementia"), stat = "identity", position = "dodge") +
  geom_bar(aes(y = Percent_SedativeHypnotic_Prescription, fill = "Sedative/Hypnotic"), stat = "identity", position = "dodge") +
  labs(title = "Prescription Refills by State",
       y = "Percentage",
       x = "State",
       fill = "Drug Type") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "top") +
  scale_fill_manual(values = c("black", "green", "blue"),
                    labels = c("Dementia", "Sedative/Hypnotic", "SSRI/SNRI"))

# Filtering by data selective to New York
ny2010<- prescription_refills[prescription_refills$State== "New York", ]
ny2010

##       State Percent_SSRISNRI_Prescription Percent_Dementia_Prescription
## 33 New York                      18.23855                      7.405458
##    Percent_SedativeHypnotic_Prescription
## 33                              10.50876

This graph shows the percentage of prescription refills by state for each drug category for the year 2010. For the rest of this code, the focus will be on SSRI/SNRI and Sedative/Hypnotic prescriptions from the years 1998 through 2030. The code below extracts data from several medications from the years 1998 through 2023.

# Reading the contents of the csv file using read.csv() function and getting a look at the data
prescriptions2023<-as.data.frame(read.csv("/Users/ursulapodosenin/Desktop/prescriptions2023.csv"))
glimpse(prescriptions2023)

## Rows: 116,603
## Columns: 15
## $ utilization_type               <chr> "FFSU", "MCOU", "FFSU", "FFSU", "MCOU",…
## $ state                          <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY…
## $ ndc                            <dbl> 2143301, 2143380, 2143380, 2143401, 214…
## $ labeler_code                   <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ product_code                   <int> 1433, 1433, 1433, 1434, 1434, 1434, 143…
## $ package_size                   <int> 1, 80, 80, 1, 80, 80, 1, 11, 11, 1, 9, …
## $ year                           <int> 2023, 2023, 2023, 2023, 2023, 2023, 202…
## $ quarter                        <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ suppression_used               <chr> "false", "true", "false", "false", "fal…
## $ product_name                   <chr> "TRULICITY ", "TRULICITY ", "TRULICITY …
## $ units_reimbursed               <dbl> 321.00, NA, 38676.43, 337.50, 40.00, 52…
## $ number_of_prescriptions        <dbl> 137, NA, 16023, 134, 13, 20813, 72, NA,…
## $ total_amount_reimbursed        <dbl> 135765.89, NA, 17484846.26, 145817.83, …
## $ medicaid_amount_reimbursed     <dbl> 135765.89, NA, 17413649.43, 145817.83, …
## $ non_medicaid_amount_reimbursed <dbl> 0.00, NA, 71196.83, 0.00, 172856.00, 96…

prescriptions2023$product_name <- as.character(tolower(prescriptions2023$product_name))

# SSRI/SNRIs 2023
prozac2023<-prescriptions2023[prescriptions2023$product_name== "fluoxetine", ]
zoloft2023<-prescriptions2023[prescriptions2023$product_name== "sertraline", ]
lexapro2023<-prescriptions2023[prescriptions2023$product_name== "citalopram", ] 
wellbutrin2023<-prescriptions2023[prescriptions2023$product_name== "wellbutrin", ]
effexor2023<-prescriptions2023[prescriptions2023$product_name== "effexor xr", ]

# Sedatives/Hypnotics 2023
xanax2023<-prescriptions2023[prescriptions2023$product_name== "alprazolam", ]
klonopin2023<-prescriptions2023[prescriptions2023$product_name== "clonazepam", ]

# Creating a data frame with the medications I need for 2023
prescriptions_2023<- rbind(prozac2023, zoloft2023, lexapro2023, wellbutrin2023, effexor2023, xanax2023, klonopin2023)
glimpse(prescriptions_2023)

## Rows: 1,804
## Columns: 15
## $ utilization_type               <chr> "FFSU", "MCOU", "FFSU", "FFSU", "FFSU",…
## $ state                          <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY…
## $ ndc                            <dbl> 93080701, 93719801, 93719801, 93719805,…
## $ labeler_code                   <int> 93, 93, 93, 93, 93, 121, 121, 378, 378,…
## $ product_code                   <int> 807, 7198, 7198, 7198, 7198, 721, 4721,…
## $ package_size                   <int> 1, 1, 1, 5, 56, 4, 5, 1, 93, 1, 93, 11,…
## $ year                           <int> 2023, 2023, 2023, 2023, 2023, 2023, 202…
## $ quarter                        <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ suppression_used               <chr> "false", "true", "false", "false", "fal…
## $ product_name                   <chr> "fluoxetine", "fluoxetine", "fluoxetine…
## $ units_reimbursed               <dbl> 1485.00, NA, 108179.00, 16757.00, 10729…
## $ number_of_prescriptions        <dbl> 36, NA, 2426, 405, 278, 380, NA, 17, 20…
## $ total_amount_reimbursed        <dbl> 551.98, NA, 32383.48, 5338.32, 3594.51,…
## $ medicaid_amount_reimbursed     <dbl> 546.98, NA, 32272.29, 5338.32, 3594.51,…
## $ non_medicaid_amount_reimbursed <dbl> 5.00, NA, 111.19, 0.00, 0.00, 1014.00, …

# Reading the contents of the csv file using read.csv() function and getting a look at the data
prescriptions2018<-as.data.frame(read.csv("/Users/ursulapodosenin/Desktop/prescriptions2018.csv"))
glimpse(prescriptions2018)

## Rows: 183,781
## Columns: 15
## $ utilization_type               <chr> "MCOU", "FFSU", "FFSU", "MCOU", "MCOU",…
## $ state                          <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY…
## $ ndc                            <dbl> 2143301, 2143301, 2143380, 2143380, 214…
## $ labeler_code                   <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ product_code                   <int> 1433, 1433, 1433, 1433, 1434, 1434, 143…
## $ package_size                   <int> 1, 1, 80, 80, 1, 1, 80, 80, 11, 1, 9, 1…
## $ year                           <int> 2018, 2018, 2018, 2018, 2018, 2018, 201…
## $ quarter                        <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ suppression_used               <chr> "false", "true", "false", "false", "fal…
## $ product_name                   <chr> "TRULICITY ", "TRULICITY ", "TRULICITY …
## $ units_reimbursed               <dbl> 289.0, NA, 784.5, 14002.5, 164.5, 38.0,…
## $ number_of_prescriptions        <dbl> 155, NA, 418, 6914, 83, 19, 8656, 414, …
## $ total_amount_reimbursed        <dbl> 100761.38, NA, 278945.94, 4878626.90, 5…
## $ medicaid_amount_reimbursed     <dbl> 100761.38, NA, 259171.38, 4877385.12, 5…
## $ non_medicaid_amount_reimbursed <dbl> 0.00, NA, 19774.56, 1241.78, 0.00, 0.00…

prescriptions2018$product_name <-tolower(prescriptions2018$product_name)

# SSRI/SNRIs 2018
prozac2018<-prescriptions2018[prescriptions2018$product_name== "fluoxetine", ]
zoloft2018<-prescriptions2018[prescriptions2018$product_name== "sertraline", ]
lexapro2018<-prescriptions2018[prescriptions2018$product_name== "citalopram", ] 
wellbutrin2018<-prescriptions2018[prescriptions2018$product_name== "wellbutrin", ]
effexor2018<-prescriptions2023[prescriptions2018$product_name== "effexor xr", ]

# Sedatives/Hypnotics 2018
xanax2018<-prescriptions2018[prescriptions2018$product_name== "alprazolam", ]
klonopin2018<-prescriptions2018[prescriptions2018$product_name== "clonazepam", ]

# Creating a data frame with the medications I need for 2018
prescriptions_2018<- rbind(prozac2018, zoloft2018, lexapro2018, wellbutrin2018, effexor2018, xanax2018, klonopin2018)
glimpse(prescriptions_2018)

## Rows: 3,650
## Columns: 15
## $ utilization_type               <chr> "MCOU", "FFSU", "MCOU", "FFSU", "MCOU",…
## $ state                          <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY…
## $ ndc                            <dbl> 93434656, 93610812, 93718810, 93718810,…
## $ labeler_code                   <int> 93, 93, 93, 93, 93, 93, 93, 93, 93, 93,…
## $ product_code                   <int> 4346, 6108, 7188, 7188, 7188, 7188, 719…
## $ package_size                   <int> 56, 12, 10, 10, 56, 56, 1, 1, 5, 56, 56…
## $ year                           <int> 2018, 2018, 2018, 2018, 2018, 2018, 201…
## $ quarter                        <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ suppression_used               <chr> "true", "true", "false", "true", "false…
## $ product_name                   <chr> "fluoxetine", "fluoxetine", "fluoxetine…
## $ units_reimbursed               <dbl> NA, NA, 667, NA, 1030, NA, 5292, 1290, …
## $ number_of_prescriptions        <dbl> NA, NA, 21, NA, 31, NA, 164, 36, 183, 2…
## $ total_amount_reimbursed        <dbl> NA, NA, 612.97, NA, 1132.75, NA, 1493.1…
## $ medicaid_amount_reimbursed     <dbl> NA, NA, 612.97, NA, 1132.75, NA, 1493.1…
## $ non_medicaid_amount_reimbursed <dbl> NA, NA, 0.00, NA, 0.00, NA, 0.00, 5.73,…

# Reading the contents of the csv file using read.csv() function and getting a look at the data
prescriptions2013<-as.data.frame(read.csv("/Users/ursulapodosenin/Desktop/prescriptions2013.csv"))
glimpse(prescriptions2013)

## Rows: 143,226
## Columns: 15
## $ utilization_type               <chr> "FFSU", "MCOU", "MCOU", "FFSU", "FFSU",…
## $ state                          <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY…
## $ ndc                            <dbl> 2197590, 2197590, 2300475, 2300475, 232…
## $ labeler_code                   <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ product_code                   <int> 1975, 1975, 3004, 3004, 3227, 3227, 322…
## $ package_size                   <int> 90, 90, 75, 75, 30, 30, 30, 30, 30, 30,…
## $ year                           <int> 2013, 2013, 2013, 2013, 2013, 2013, 201…
## $ quarter                        <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ suppression_used               <chr> "false", "false", "true", "true", "fals…
## $ product_name                   <chr> "AXIRON    ", "AXIRON    ", "PROZAC WEE…
## $ units_reimbursed               <dbl> 1260.00, 39740.00, NA, NA, 8670.00, 233…
## $ number_of_prescriptions        <dbl> 14, 406, NA, NA, 208, 606, 1697, 549, 8…
## $ total_amount_reimbursed        <dbl> 5049.96, 162268.17, NA, NA, 59433.88, 1…
## $ medicaid_amount_reimbursed     <dbl> 3834.86, 162268.17, NA, NA, 38849.61, 1…
## $ non_medicaid_amount_reimbursed <dbl> 1215.10, 0.00, NA, NA, 20584.27, 0.00, …

prescriptions2013$product_name <-tolower(prescriptions2013$product_name)

# SSRI/SNRIs 2013
prozac2013<-prescriptions2013[prescriptions2013$product_name== "fluoxetine", ]
zoloft2013<-prescriptions2013[prescriptions2013$product_name== "sertraline", ]
lexapro2013<-prescriptions2013[prescriptions2013$product_name== "citalopram", ] 
wellbutrin2013<-prescriptions2013[prescriptions2013$product_name== "wellbutrin", ]
effexor2013<-prescriptions2013[prescriptions2013$product_name== "effexor xr", ]

# Sedatives/Hypnotics 2013
xanax2013<-prescriptions2013[prescriptions2013$product_name== "alprazolam", ]
klonopin2013<-prescriptions2013[prescriptions2013$product_name== "clonazepam", ]

# Creating a data frame with the medications I need for 2013
prescriptions_2013<- rbind(prozac2013, zoloft2013, lexapro2013, wellbutrin2013, effexor2013, xanax2013, klonopin2013)
glimpse(prescriptions_2013)

## Rows: 3,675
## Columns: 15
## $ utilization_type               <chr> "FFSU", "MCOU", "MCOU", "FFSU", "FFSU",…
## $ state                          <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY…
## $ ndc                            <dbl> 93434656, 93434656, 93435619, 93435619,…
## $ labeler_code                   <int> 93, 93, 93, 93, 93, 93, 93, 93, 93, 93,…
## $ product_code                   <int> 4346, 4346, 4356, 4356, 4356, 4356, 610…
## $ package_size                   <int> 56, 56, 19, 19, 93, 93, 12, 12, 10, 56,…
## $ year                           <int> 2013, 2013, 2013, 2013, 2013, 2013, 201…
## $ quarter                        <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ suppression_used               <chr> "false", "false", "true", "true", "fals…
## $ product_name                   <chr> "fluoxetine", "fluoxetine", "fluoxetine…
## $ units_reimbursed               <dbl> 10699.00, 99232.00, NA, NA, 480.00, 673…
## $ number_of_prescriptions        <dbl> 307, 2954, NA, NA, 12, 177, 125, 79, NA…
## $ total_amount_reimbursed        <dbl> 3320.79, 43195.07, NA, NA, 55.20, 563.0…
## $ medicaid_amount_reimbursed     <dbl> 3082.04, 43195.07, NA, NA, 55.20, 563.0…
## $ non_medicaid_amount_reimbursed <dbl> 238.75, 0.00, NA, NA, 0.00, 0.00, 0.00,…

# Reading the contents of the csv file using read.csv() function and getting a look at the data
prescriptions2008<-as.data.frame(read.csv("/Users/ursulapodosenin/Desktop/prescriptions2008.csv"))
glimpse(prescriptions2008)

## Rows: 61,253
## Columns: 15
## $ utilization_type               <chr> "FFSU", "FFSU", "FFSU", "FFSU", "FFSU",…
## $ state                          <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY…
## $ ndc                            <dbl> 2148501, 2300475, 2322730, 2322830, 232…
## $ labeler_code                   <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ product_code                   <int> 1485, 3004, 3227, 3228, 3229, 3230, 323…
## $ package_size                   <int> 1, 75, 30, 30, 30, 30, 30, 30, 30, 30, …
## $ year                           <int> 2008, 2008, 2008, 2008, 2008, 2008, 200…
## $ quarter                        <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ suppression_used               <chr> "true", "false", "false", "false", "fal…
## $ product_name                   <chr> "CAPASTAT S", "PROZAC WEE", "STRATTERA …
## $ units_reimbursed               <dbl> NA, 5143.00, 44858.00, 120994.11, 17439…
## $ number_of_prescriptions        <dbl> NA, 992, 1050, 2949, 4545, 106, 390, 18…
## $ total_amount_reimbursed        <dbl> NA, 137412.76, 193617.64, 522585.23, 81…
## $ medicaid_amount_reimbursed     <dbl> NA, 135789.87, 180906.88, 479900.83, 76…
## $ non_medicaid_amount_reimbursed <dbl> NA, 1622.89, 12710.76, 42684.40, 45433.…

prescriptions2008$product_name <-tolower(prescriptions2008$product_name)

# SSRI/SNRIs 2008
prozac2008<-prescriptions2008[prescriptions2008$product_name== "fluoxetine", ]
zoloft2008<-prescriptions2008[prescriptions2008$product_name== "sertraline", ]
lexapro2008<-prescriptions2008[prescriptions2008$product_name== "citalopram", ] 
wellbutrin2008<-prescriptions2008[prescriptions2008$product_name== "wellbutrin", ]
effexor2008<-prescriptions2008[prescriptions2008$product_name== "effexor xr", ]

# Sedatives/Hypnotics 2008
xanax2008<-prescriptions2008[prescriptions2008$product_name== "alprazolam", ]
klonopin2008<-prescriptions2008[prescriptions2008$product_name== "clonazepam", ]

# Creating a data frame with the medications I need for 2008
prescriptions_2008<- rbind(prozac2008, zoloft2008, lexapro2008, wellbutrin2008, effexor2008, xanax2008, klonopin2008)
glimpse(prescriptions_2008)

## Rows: 1,881
## Columns: 15
## $ utilization_type               <chr> "FFSU", "FFSU", "FFSU", "FFSU", "FFSU",…
## $ state                          <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY…
## $ ndc                            <dbl> 93104201, 93104210, 93104301, 93435601,…
## $ labeler_code                   <int> 93, 93, 93, 93, 93, 93, 93, 93, 121, 17…
## $ product_code                   <int> 1042, 1042, 1043, 4356, 6108, 7188, 719…
## $ package_size                   <int> 1, 10, 1, 1, 12, 56, 1, 56, 4, 46, 60, …
## $ year                           <int> 2008, 2008, 2008, 2008, 2008, 2008, 200…
## $ quarter                        <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ suppression_used               <chr> "false", "true", "false", "false", "fal…
## $ product_name                   <chr> "fluoxetine", "fluoxetine", "fluoxetine…
## $ units_reimbursed               <dbl> 81960.0, NA, 181085.0, 25597.0, 42417.5…
## $ number_of_prescriptions        <dbl> 2015, NA, 4146, 554, 262, 324, 63, 3463…
## $ total_amount_reimbursed        <dbl> 19970.15, NA, 46725.58, 6491.12, 7535.6…
## $ medicaid_amount_reimbursed     <dbl> 19832.97, NA, 46546.02, 6464.77, 7171.0…
## $ non_medicaid_amount_reimbursed <dbl> 137.18, NA, 179.56, 26.35, 364.68, 69.2…

# Reading the contents of the csv file using read.csv() function and getting a look at the data
prescriptions2003<-as.data.frame(read.csv("/Users/ursulapodosenin/Desktop/prescriptions2003.csv"))
glimpse(prescriptions2003)

## Rows: 63,587
## Columns: 15
## $ utilization_type               <chr> "FFSU", "FFSU", "FFSU", "FFSU", "FFSU",…
## $ state                          <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY…
## $ ndc                            <dbl> 2035102, 2035302, 2035303, 2036302, 203…
## $ labeler_code                   <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ product_code                   <int> 351, 353, 353, 363, 363, 803, 817, 819,…
## $ package_size                   <int> 2, 2, 3, 2, 3, 2, 2, 2, 1, 25, 2, 5, 75…
## $ year                           <int> 2003, 2003, 2003, 2003, 2003, 2003, 200…
## $ quarter                        <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ suppression_used               <chr> "false", "false", "true", "false", "fal…
## $ product_name                   <chr> "DARVOCET N", "DARVON-N  ", "DARVON-N  …
## $ units_reimbursed               <dbl> 7892, 27795, NA, 26905, 2415, 3000, NA,…
## $ number_of_prescriptions        <dbl> 132, 309, NA, 263, 28, 25, NA, 15, NA, …
## $ total_amount_reimbursed        <dbl> 3402.34, 20719.07, NA, 23183.12, 2102.2…
## $ medicaid_amount_reimbursed     <dbl> 0.00, 20719.07, NA, 0.00, 0.00, 1680.81…
## $ non_medicaid_amount_reimbursed <dbl> 0, 0, NA, 0, 0, 0, NA, 0, NA, NA, 0, NA…

prescriptions2003$product_name <-tolower(prescriptions2003$product_name)

# SSRI/SNRIs 2003
prozac2003<-prescriptions2003[prescriptions2003$product_name== "fluoxetine", ]
zoloft2003<-prescriptions2003[prescriptions2003$product_name== "sertraline", ]
lexapro2003<-prescriptions2003[prescriptions2003$product_name== "citalopram", ] 
wellbutrin2003<-prescriptions2003[prescriptions2003$product_name== "wellbutrin", ]
effexor2003<-prescriptions2003[prescriptions2003$product_name== "effexor xr", ]

# Sedatives/Hypnotics 2003
xanax2003<-prescriptions2003[prescriptions2003$product_name== "alprazolam", ]
klonopin2003<-prescriptions2003[prescriptions2003$product_name== "clonazepam", ]

# Creating a data frame with the medications I need for 2003
prescriptions_2003<- rbind(prozac2003, zoloft2003, lexapro2003, wellbutrin2003, effexor2003, xanax2003, klonopin2003)
glimpse(prescriptions_2003)

## Rows: 842
## Columns: 15
## $ utilization_type               <chr> "FFSU", "FFSU", "FFSU", "FFSU", "FFSU",…
## $ state                          <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY…
## $ ndc                            <dbl> 93104201, 93104301, 93610812, 93718810,…
## $ labeler_code                   <int> 93, 93, 93, 93, 93, 93, 121, 172, 172, …
## $ product_code                   <int> 1042, 1043, 6108, 7188, 7188, 7198, 721…
## $ package_size                   <int> 1, 1, 12, 10, 56, 56, 4, 60, 70, 80, 60…
## $ year                           <int> 2003, 2003, 2003, 2003, 2003, 2003, 200…
## $ quarter                        <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ suppression_used               <chr> "false", "false", "false", "true", "fal…
## $ product_name                   <chr> "fluoxetine", "fluoxetine", "fluoxetine…
## $ units_reimbursed               <dbl> 42836, 196256, 48455, NA, 5513, 99686, …
## $ number_of_prescriptions        <dbl> 1026, 4281, 222, NA, 134, 2737, 33, 110…
## $ total_amount_reimbursed        <dbl> 26816.01, 132589.80, 33290.71, NA, 3255…
## $ medicaid_amount_reimbursed     <dbl> 0.00, 132589.80, 0.00, NA, 0.00, 0.00, …
## $ non_medicaid_amount_reimbursed <dbl> 0, 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0,…

# Reading the contents of the csv file using read.csv() function and getting a look at the data
prescriptions1998<-as.data.frame(read.csv("/Users/ursulapodosenin/Desktop/prescriptions1998.csv"))
glimpse(prescriptions1998)

## Rows: 92,938
## Columns: 15
## $ utilization_type               <chr> "FFSU", "FFSU", "FFSU", "FFSU", "FFSU",…
## $ state                          <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY…
## $ ndc                            <dbl> 2035102, 2035302, 2035303, 2036302, 203…
## $ labeler_code                   <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ product_code                   <int> 351, 353, 353, 363, 363, 363, 640, 803,…
## $ package_size                   <int> 2, 2, 3, 2, 3, 46, 2, 2, 3, 2, 2, 2, 2,…
## $ year                           <int> 1998, 1998, 1998, 1998, 1998, 1998, 199…
## $ quarter                        <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ suppression_used               <chr> "false", "false", "false", "false", "fa…
## $ product_name                   <chr> "DARVOCET N", "DARVON-N  ", "DARVON-N  …
## $ units_reimbursed               <dbl> 24259, 21505, 1649, 204234, 46603, 1690…
## $ number_of_prescriptions        <dbl> 457, 277, 16, 2662, 571, 18, 97, 119, N…
## $ total_amount_reimbursed        <dbl> 8279.44, 11797.01, 848.99, 118588.31, 2…
## $ medicaid_amount_reimbursed     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0,…
## $ non_medicaid_amount_reimbursed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0,…

prescriptions1998$product_name <-tolower(prescriptions1998$product_name)

# SSRI/SNRIs 1998
prozac1998<-prescriptions1998[prescriptions1998$product_name== "fluoxetine", ]
zoloft1998<-prescriptions1998[prescriptions1998$product_name== "sertraline", ]
lexapro1998<-prescriptions1998[prescriptions1998$product_name== "citalopram", ] 
wellbutrin1998<-prescriptions1998[prescriptions1998$product_name== "wellbutrin", ]
effexor1998<-prescriptions1998[prescriptions1998$product_name== "effexor xr", ]

# Sedatives/Hypnotics 1998
xanax1998<-prescriptions1998[prescriptions1998$product_name== "alprazolam", ]
klonopin1998<-prescriptions1998[prescriptions1998$product_name== "clonazepam", ]

# Creating a data frame with the medications I need for 1998
prescriptions_1998<- rbind(prozac1998, zoloft1998, lexapro1998, wellbutrin1998, effexor1998, xanax1998, klonopin1998)
glimpse(prescriptions_1998)

## Rows: 654
## Columns: 15
## $ utilization_type               <chr> "FFSU", "FFSU", "FFSU", "FFSU", "FFSU",…
## $ state                          <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY…
## $ ndc                            <dbl> 81017755, 81017855, 173013555, 17301775…
## $ labeler_code                   <int> 81, 81, 173, 173, 173, 173, 81, 81, 173…
## $ product_code                   <int> 177, 178, 135, 177, 178, 947, 177, 178,…
## $ package_size                   <int> 55, 55, 55, 55, 55, 55, 55, 55, 55, 55,…
## $ year                           <int> 1998, 1998, 1998, 1998, 1998, 1998, 199…
## $ quarter                        <int> 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 2, …
## $ suppression_used               <chr> "false", "false", "false", "false", "fa…
## $ product_name                   <chr> "wellbutrin", "wellbutrin", "wellbutrin…
## $ units_reimbursed               <dbl> 23024, 23968, 796913, 259421, 205561, 1…
## $ number_of_prescriptions        <dbl> 370, 369, 14898, 3971, 3110, 2846, 445,…
## $ total_amount_reimbursed        <dbl> 13962.47, 19177.14, 997179.60, 181304.5…
## $ medicaid_amount_reimbursed     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ non_medicaid_amount_reimbursed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

Now that the desired data has been extracted, the focus shifts to getting an idea of what the prescription data looks like across the years.

# Combining prescription data from multiple years into a single data frame
prescriptions_total_years <- bind_rows(prescriptions_2023, prescriptions_2018, prescriptions_2013, prescriptions_2008, prescriptions_2003, prescriptions_1998)

# Creating a clean version of the combined data frame by removing rows with any missing values
prescriptions_total_years_clean <- prescriptions_total_years[complete.cases(prescriptions_total_years),]

# Grouping the cleaned data frame by product name and year, then calculating various summary statistics
prescriptions_total_years_clean|>
  group_by(product_name)|>
    summarise(total_prescriptions= sum(number_of_prescriptions),
              average_prescriptions= mean(number_of_prescriptions),
              min_prescriptions= min(number_of_prescriptions),
              max_prescriptions= max(number_of_prescriptions),
              range_prescriptions= range(number_of_prescriptions),
              sd_prescriptions= sd((number_of_prescriptions)
    ))

## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `summarise()` has grouped output by 'product_name'. You can override using the
## `.groups` argument.

## # A tibble: 38 × 7
## # Groups:   product_name [19]
##    product_name total_prescriptions average_prescriptions min_prescriptions
##    <chr>                      <dbl>                 <dbl>             <dbl>
##  1 "albendazol"                 296                  296                296
##  2 "albendazol"                 296                  296                296
##  3 "alprazolam"             1399134                  617.                11
##  4 "alprazolam"             1399134                  617.                11
##  5 "caverject "                  30                   30                 30
##  6 "caverject "                  30                   30                 30
##  7 "citalopram"              864949                  548.                11
##  8 "citalopram"              864949                  548.                11
##  9 "clonazepam"             1733497                  958.                11
## 10 "clonazepam"             1733497                  958.                11
## # ℹ 28 more rows
## # ℹ 3 more variables: max_prescriptions <dbl>, range_prescriptions <dbl>,
## #   sd_prescriptions <dbl>

# Selecting the columns needed for analysis 
year_and_number_of_prescriptions_by_name <- prescriptions_total_years_clean[, c("year", "product_name", "number_of_prescriptions")]
glimpse(year_and_number_of_prescriptions_by_name)

## Rows: 9,626
## Columns: 3
## $ year                    <int> 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023…
## $ product_name            <chr> "fluoxetine", "fluoxetine", "fluoxetine", "flu…
## $ number_of_prescriptions <dbl> 36, 2426, 405, 278, 380, 17, 20, 45, 34, 97, 6…

# Grouping the data by product name and year, calculating the total prescriptions, and arranging the data by product name
data_for_predictions <- year_and_number_of_prescriptions_by_name |> 
  group_by(product_name, year) |> 
  summarise(total_prescriptions = sum(number_of_prescriptions), .groups = 'drop') |> 
  arrange(product_name, desc(year))
glimpse(data_for_predictions)

## Rows: 48
## Columns: 3
## $ product_name        <chr> "albendazol", "alprazolam", "alprazolam", "alprazo…
## $ year                <int> 2023, 2023, 2018, 2013, 2008, 2003, 1998, 2023, 20…
## $ total_prescriptions <dbl> 296, 226100, 338904, 323498, 263901, 160400, 86331…

Now, a linear model will be used to project the number of prescriptions for each medication into the year 2030.

# Filtering the data frame for the relevant product names
selected_names <- c("alprazolam", "citalopram", "clonazepam", "effexor xr", "fluoxetine", "sertraline", "wellbutrin")
filtered_data <- data_for_predictions[data_for_predictions$product_name %in% selected_names, ]

# Fitting a linear regression model for each product
models <- filtered_data |> 
  group_by(product_name) |> 
  do(model = lm(total_prescriptions ~ year, data = .))

# Predicting the total prescriptions for 2030
predictions <- models |> 
  mutate(predicted_prescriptions_2030 = predict(model, newdata = data.frame(year = 2030)))

# Plotting the trend line and predictions
ggplot(filtered_data, aes(x = year, y = total_prescriptions, color = product_name)) +
  geom_point() + 
  geom_smooth(method = "lm", se = FALSE, fullrange = TRUE) +  
  geom_text(data = predictions, aes(label = round(predicted_prescriptions_2030), x = 2030, y = predicted_prescriptions_2030), vjust = -0.5) +  
  labs(title = "Total Prescriptions Trend and Prediction for 2030",
       x = "Year",
       y = "Total Prescriptions") +
  theme_minimal() +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE))

## `geom_smooth()` using formula = 'y ~ x'

# Predicting the total prescriptions for 2030
predictions_2030 <- models |>
  mutate(predicted_prescriptions_2030 = predict(model, newdata = data.frame(year = 2030)))
predictions_2030

## # A tibble: 7 × 3
## # Rowwise: 
##   product_name model  predicted_prescriptions_2030
##   <chr>        <list>                        <dbl>
## 1 alprazolam   <lm>                        377372.
## 2 citalopram   <lm>                        201802.
## 3 clonazepam   <lm>                        441702.
## 4 effexor xr   <lm>                        -15392.
## 5 fluoxetine   <lm>                        448136.
## 6 sertraline   <lm>                        825023.
## 7 wellbutrin   <lm>                        -71754.

Based on the graphic and data table, it appears that Wellbutrin is the most likely to see a decrease in presciptions, while Sertraline, also known as Zoloft, is the the most likely to see an increase in prescriptions.

Going in a slightly different direction for a moment, a sentiment analysis will be performed on the reviews that patients left for each drug.

# Obtaining different sentiment analysis libraries
get_sentiments("afinn")

## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows

get_sentiments("nrc")

## # A tibble: 13,872 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ℹ 13,862 more rows

get_sentiments("bing")

## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ℹ 6,776 more rows

# Organizing the SSRI's/ SNRIs medications into one data frame
Fluoxetine <- drug_reviews[drug_reviews$drugName == "Prozac", ]
Sertraline <- drug_reviews[drug_reviews$drugName == "Zoloft", ]
Wellbutrin <- drug_reviews[drug_reviews$drugName == "Wellbutrin", ]
Citalopram <- drug_reviews[drug_reviews$drugName == "Celexa", ]
Effexor <- drug_reviews[drug_reviews$drugName == "Effexor", ]
ssrisnri<- rbind(Fluoxetine, Sertraline, Wellbutrin, Citalopram, Effexor)
glimpse(ssrisnri)

## Rows: 810
## Columns: 7
## $ ...1        <dbl> 181313, 181495, 181391, 181344, 181657, 181693, 181273, 18…
## $ drugName    <chr> "Prozac", "Prozac", "Prozac", "Prozac", "Prozac", "Prozac"…
## $ condition   <chr> "Major Depressive Disorde", "Anxiety and Stress", "Anxiety…
## $ review      <chr> "\"I lost my sexuality from the first pill, overnight it w…
## $ rating      <dbl> 1, 6, 6, 8, 10, 7, 2, 9, 9, 10, 10, 5, 9, 10, 10, 9, 10, 8…
## $ date        <chr> "November 17, 2015", "August 8, 2014", "September 13, 2015…
## $ usefulCount <dbl> 24, 51, 38, 54, 193, 40, 35, 160, 115, 167, 50, 27, 102, 1…

# Renaming the prescriptions 
ssrisnri$drugName<-ifelse(ssrisnri$drugName== "Prozac", "fluoxetine", ssrisnri$drugName)
ssrisnri$drugName<-ifelse(ssrisnri$drugName== "Zoloft", "sertraline", ssrisnri$drugName)
ssrisnri$drugName<-ifelse(ssrisnri$drugName== "Celexa", "citalopram", ssrisnri$drugName)
ssrisnri$drugName<-ifelse(ssrisnri$drugName== "Effexor", "effexor xr", ssrisnri$drugName)
ssrisnri$drugName<-ifelse(ssrisnri$drugName== "Wellbutrin", "wellburtin", ssrisnri$drugName)

# Performing the sentiment analysis
ssrisnri_sentiment <- ssrisnri |>
  dplyr::select(drugName, review) |>  
  mutate(review = as.character(review)) |> 
  unnest_tokens(word, review) |>  
  inner_join(get_sentiments("bing")) |> 
  count(drugName, sentiment) |>  
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) |> 
  mutate(sentiment_score = positive - negative)

## Joining with `by = join_by(word)`

# Viewing the resulting data frame and finding the average sentiment score
print(ssrisnri_sentiment)

## # A tibble: 5 × 4
##   drugName   negative positive sentiment_score
##   <chr>         <int>    <int>           <int>
## 1 citalopram      899      567            -332
## 2 effexor xr      470      300            -170
## 3 fluoxetine      897      599            -298
## 4 sertraline     1624      978            -646
## 5 wellburtin      571      396            -175

mean(ssrisnri_sentiment$sentiment_score)

## [1] -324.2

# Creating a bar plot of sentiment scores for each drug
ggplot(ssrisnri_sentiment, aes(x = drugName, y = sentiment_score, fill = sentiment_score > 0)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Sentiment Analysis of SSRI and SNRI Medications",
       x = "Drug Name",
       y = "Sentiment Score",
       fill = "Sentiment") +
  scale_fill_manual(values = c("red", "green"), labels = c("Negative", "Positive")) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

### Based on the sentiment analysis for the SSRI/SNRI drug classes, it appears that sertraline has the greatest negative sentiment overall.

# Organizing the Sedative/Hypnotic Medications into one data
Alprazolam <- drug_reviews[drug_reviews$drugName == "Xanax", ]
Clonazepam <- drug_reviews[drug_reviews$drugName == "Klonopin", ]
sedativehypnotic<-rbind(Alprazolam, Clonazepam)
sedativehypnotic$drugName <- ifelse(sedativehypnotic$drugName == "Xanax", "alprazolam", sedativehypnotic$drugName)
sedativehypnotic$drugName <- ifelse(sedativehypnotic$drugName == "Klonopin", "clonazepam", sedativehypnotic$drugName)
sedativehypnotic

## # A tibble: 270 × 7
##     ...1 drugName   condition     review                rating date  usefulCount
##    <dbl> <chr>      <chr>         <chr>                  <dbl> <chr>       <dbl>
##  1  7953 alprazolam Anxiety       "\"I am a 33 year ol…      9 Apri…          32
##  2  7744 alprazolam Depression    "\"The first time I …     10 Nove…         193
##  3  8095 alprazolam Panic Disorde "\"Efficacy never in…      7 Apri…         120
##  4  7598 alprazolam Anxiety       "\"I have had PTSD a…     10 Augu…          53
##  5  7897 alprazolam Anxiety       "\"Xanax has been a …     10 Janu…          18
##  6  7745 alprazolam Anxiety       "\"I take Xanax very…      9 Octo…          52
##  7  7610 alprazolam Depression    "\"I suffer from soc…     10 May …         115
##  8  7670 alprazolam Panic Disorde "\"The only medicati…     10 Apri…         118
##  9  7992 alprazolam Anxiety       "\"I have been on Xa…     10 June…          24
## 10  7589 alprazolam Anxiety       "\"This drug literal…     10 Octo…          46
## # ℹ 260 more rows

# Performing the sentiment analysis
sedativehypnotic_sentiment <- sedativehypnotic |>
  dplyr::select(drugName, review) |> 
  mutate(review = as.character(review)) |>  
  unnest_tokens(word, review) |> 
  inner_join(get_sentiments("bing")) |>  
  count(drugName, sentiment) |>  
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) |>  
  mutate(sentiment_score = positive - negative)

## Joining with `by = join_by(word)`

# Viewing the resulting data frame and finding the average sentiment score
print(sedativehypnotic_sentiment)

## # A tibble: 2 × 4
##   drugName   negative positive sentiment_score
##   <chr>         <int>    <int>           <int>
## 1 alprazolam      736      443            -293
## 2 clonazepam      568      358            -210

mean(sedativehypnotic_sentiment$sentiment_score)

## [1] -251.5

# Creating a bar plot of sentiment scores for each drug
ggplot(sedativehypnotic_sentiment, aes(x = drugName, y = sentiment_score, fill = sentiment_score > 0)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Sentiment Analysis of Sedative and Hypnotic Medications",
       x = "Drug Name",
       y = "Sentiment Score",
       fill = "Sentiment") +
  scale_fill_manual(values = c("red", "green"), labels = c("Negative", "Positive")) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

### Based on the sentiment analysis for the Sedative/Hypnotic drug classes, it appears that alprazolam, also known as Xanax, has the greatest negative sentiment overall.

Now that the regression and sentiment analysis have been performed, the goal is to look at the two combined and evaluate the trends.

# Making a combined table of all sentiment scores 
sentiment_scores<-rbind(ssrisnri_sentiment, sedativehypnotic_sentiment)

# Viewing sentiment scores and prescription predictions for 2030
sentiment_scores

## # A tibble: 7 × 4
##   drugName   negative positive sentiment_score
##   <chr>         <int>    <int>           <int>
## 1 citalopram      899      567            -332
## 2 effexor xr      470      300            -170
## 3 fluoxetine      897      599            -298
## 4 sertraline     1624      978            -646
## 5 wellburtin      571      396            -175
## 6 alprazolam      736      443            -293
## 7 clonazepam      568      358            -210

predictions_2030

## # A tibble: 7 × 3
## # Rowwise: 
##   product_name model  predicted_prescriptions_2030
##   <chr>        <list>                        <dbl>
## 1 alprazolam   <lm>                        377372.
## 2 citalopram   <lm>                        201802.
## 3 clonazepam   <lm>                        441702.
## 4 effexor xr   <lm>                        -15392.
## 5 fluoxetine   <lm>                        448136.
## 6 sertraline   <lm>                        825023.
## 7 wellbutrin   <lm>                        -71754.

names(predictions_2030)[1]<- "drugName"

merged_data <- merge(sentiment_scores, predictions_2030, by = "drugName", all.x = TRUE)

# Plotting the relationship between sentiment scores and predicted values for 2030 for each drug
ggplot(merged_data, aes(x = predicted_prescriptions_2030, y = sentiment_score, color = drugName)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, aes(group = drugName)) +
  labs(title = "Relationship between Sentiment Scores and Predicted Values for 2030",
       x = "Predicted Total Prescriptions for 2030",
       y = "Sentiment Score",
       color = "Drug Name") +
  theme_classic() +
  scale_x_continuous(labels = scales::comma)

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 1 rows containing missing values (`geom_point()`).

This graphic provides an interesting interpretation of the data. While Sertraline was predicted to have the greatest number of prescriptions going into the year 2030, it also had the most negative sentiment overall. Conversely, Effexor had the best sentiment rating of all the medications, but was unlikely to see an upward trend in prescriptions going into the year 2030. From a graphical standpoint, both Sertraline and Effexor appear to be outliers. The remaining data all appear to group around a negative sentiment score of about 300 and an average predicted prescription number of around 400,000. Some data points were excluded, as they contained missing and undefined values.