# Installing the required packages
library(tidyverse)
## Warning: package 'tidyr' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(dplyr)
library(ggplot2)
library(jsonlite)
##
## Attaching package: 'jsonlite'
##
## The following object is masked from 'package:purrr':
##
## flatten
install.packages("readr", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/5m/4f5rvwrn5rngf6j4gpl2mc9w0000gn/T//RtmpYvR2Aq/downloaded_packages
library(readr)
install.packages("tidytext" ,repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/5m/4f5rvwrn5rngf6j4gpl2mc9w0000gn/T//RtmpYvR2Aq/downloaded_packages
library("tidytext")
## Warning: package 'tidytext' was built under R version 4.3.2
install.packages("stringr",repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/5m/4f5rvwrn5rngf6j4gpl2mc9w0000gn/T//RtmpYvR2Aq/downloaded_packages
library(stringr)
install.packages("forecast",repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/5m/4f5rvwrn5rngf6j4gpl2mc9w0000gn/T//RtmpYvR2Aq/downloaded_packages
library("forecast")
## Warning: package 'forecast' was built under R version 4.3.2
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
install.packages("zoo" ,repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/5m/4f5rvwrn5rngf6j4gpl2mc9w0000gn/T//RtmpYvR2Aq/downloaded_packages
library("zoo")
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Getting the working directory
getwd()
## [1] "/Users/ursulapodosenin/Desktop"
# Reading the contents of the tsv file using read_tsv() function and getting a look at the data
drug_reviews<-readr::read_tsv("/Users/ursulapodosenin/Desktop/drugreviews.tsv")
## New names:
## Rows: 53766 Columns: 7
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (4): drugName, condition, review, date dbl (3): ...1, rating, usefulCount
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
glimpse(drug_reviews)
## Rows: 53,766
## Columns: 7
## $ ...1 <dbl> 163740, 206473, 159672, 39293, 97768, 208087, 215892, 1698…
## $ drugName <chr> "Mirtazapine", "Mesalamine", "Bactrim", "Contrave", "Cycla…
## $ condition <chr> "Depression", "Crohn's Disease, Maintenance", "Urinary Tra…
## $ review <chr> "\"I've tried a few antidepressants over the years (c…
## $ rating <dbl> 10, 8, 9, 9, 9, 4, 6, 9, 7, 2, 1, 6, 1, 10, 2, 9, 10, 3, 6…
## $ date <chr> "February 28, 2012", "May 17, 2009", "September 29, 2017",…
## $ usefulCount <dbl> 22, 17, 3, 35, 4, 13, 1, 32, 21, 3, 17, 7, 57, 19, 44, 14,…
# Reading the contents of the csv file using read.csv() function and getting a look at the data
prescription_refills<-as.data.frame(read.csv("/Users/ursulapodosenin/Desktop/prescriptiondata2010.csv"))
# Creating a new data frame with the columns I need
state<-prescription_refills[,3]
prescription_refills<-prescription_refills[,14:16]
prescription_refills<-cbind(state, prescription_refills)
glimpse(prescription_refills)
## Rows: 52
## Columns: 4
## $ state <chr> …
## $ Percent.of.beneficiaries.filling.at.least.one.prescription.for.an.SSRI.SNRI..2010. <dbl> …
## $ Percent.of.beneficiaries.filling.at.least.one.prescription.for.a.dementia.medication..2010. <dbl> …
## $ Percent.of.beneficiaries.filling.at.least.one.prescription.for.a.new.sedative.hypnotic..2010. <dbl> …
# Renaming the columns of the data frame
colnames(prescription_refills) <- c("State", "Percent_SSRISNRI_Prescription", "Percent_Dementia_Prescription", "Percent_SedativeHypnotic_Prescription")
# Plotting prescription refills by state using ggplot2
ggplot(prescription_refills, aes(x = State)) +
geom_bar(aes(y = Percent_SSRISNRI_Prescription, fill = "SSRI/SNRI"), stat = "identity", position = "dodge") +
geom_bar(aes(y = Percent_Dementia_Prescription, fill = "Dementia"), stat = "identity", position = "dodge") +
geom_bar(aes(y = Percent_SedativeHypnotic_Prescription, fill = "Sedative/Hypnotic"), stat = "identity", position = "dodge") +
labs(title = "Prescription Refills by State",
y = "Percentage",
x = "State",
fill = "Drug Type") +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "top") +
scale_fill_manual(values = c("black", "green", "blue"),
labels = c("Dementia", "Sedative/Hypnotic", "SSRI/SNRI"))
# Filtering by data selective to New York
ny2010<- prescription_refills[prescription_refills$State== "New York", ]
ny2010
## State Percent_SSRISNRI_Prescription Percent_Dementia_Prescription
## 33 New York 18.23855 7.405458
## Percent_SedativeHypnotic_Prescription
## 33 10.50876
# Reading the contents of the csv file using read.csv() function and getting a look at the data
prescriptions2023<-as.data.frame(read.csv("/Users/ursulapodosenin/Desktop/prescriptions2023.csv"))
glimpse(prescriptions2023)
## Rows: 116,603
## Columns: 15
## $ utilization_type <chr> "FFSU", "MCOU", "FFSU", "FFSU", "MCOU",…
## $ state <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY…
## $ ndc <dbl> 2143301, 2143380, 2143380, 2143401, 214…
## $ labeler_code <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ product_code <int> 1433, 1433, 1433, 1434, 1434, 1434, 143…
## $ package_size <int> 1, 80, 80, 1, 80, 80, 1, 11, 11, 1, 9, …
## $ year <int> 2023, 2023, 2023, 2023, 2023, 2023, 202…
## $ quarter <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ suppression_used <chr> "false", "true", "false", "false", "fal…
## $ product_name <chr> "TRULICITY ", "TRULICITY ", "TRULICITY …
## $ units_reimbursed <dbl> 321.00, NA, 38676.43, 337.50, 40.00, 52…
## $ number_of_prescriptions <dbl> 137, NA, 16023, 134, 13, 20813, 72, NA,…
## $ total_amount_reimbursed <dbl> 135765.89, NA, 17484846.26, 145817.83, …
## $ medicaid_amount_reimbursed <dbl> 135765.89, NA, 17413649.43, 145817.83, …
## $ non_medicaid_amount_reimbursed <dbl> 0.00, NA, 71196.83, 0.00, 172856.00, 96…
prescriptions2023$product_name <- as.character(tolower(prescriptions2023$product_name))
# SSRI/SNRIs 2023
prozac2023<-prescriptions2023[prescriptions2023$product_name== "fluoxetine", ]
zoloft2023<-prescriptions2023[prescriptions2023$product_name== "sertraline", ]
lexapro2023<-prescriptions2023[prescriptions2023$product_name== "citalopram", ]
wellbutrin2023<-prescriptions2023[prescriptions2023$product_name== "wellbutrin", ]
effexor2023<-prescriptions2023[prescriptions2023$product_name== "effexor xr", ]
# Sedatives/Hypnotics 2023
xanax2023<-prescriptions2023[prescriptions2023$product_name== "alprazolam", ]
klonopin2023<-prescriptions2023[prescriptions2023$product_name== "clonazepam", ]
# Creating a data frame with the medications I need for 2023
prescriptions_2023<- rbind(prozac2023, zoloft2023, lexapro2023, wellbutrin2023, effexor2023, xanax2023, klonopin2023)
glimpse(prescriptions_2023)
## Rows: 1,804
## Columns: 15
## $ utilization_type <chr> "FFSU", "MCOU", "FFSU", "FFSU", "FFSU",…
## $ state <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY…
## $ ndc <dbl> 93080701, 93719801, 93719801, 93719805,…
## $ labeler_code <int> 93, 93, 93, 93, 93, 121, 121, 378, 378,…
## $ product_code <int> 807, 7198, 7198, 7198, 7198, 721, 4721,…
## $ package_size <int> 1, 1, 1, 5, 56, 4, 5, 1, 93, 1, 93, 11,…
## $ year <int> 2023, 2023, 2023, 2023, 2023, 2023, 202…
## $ quarter <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ suppression_used <chr> "false", "true", "false", "false", "fal…
## $ product_name <chr> "fluoxetine", "fluoxetine", "fluoxetine…
## $ units_reimbursed <dbl> 1485.00, NA, 108179.00, 16757.00, 10729…
## $ number_of_prescriptions <dbl> 36, NA, 2426, 405, 278, 380, NA, 17, 20…
## $ total_amount_reimbursed <dbl> 551.98, NA, 32383.48, 5338.32, 3594.51,…
## $ medicaid_amount_reimbursed <dbl> 546.98, NA, 32272.29, 5338.32, 3594.51,…
## $ non_medicaid_amount_reimbursed <dbl> 5.00, NA, 111.19, 0.00, 0.00, 1014.00, …
# Reading the contents of the csv file using read.csv() function and getting a look at the data
prescriptions2018<-as.data.frame(read.csv("/Users/ursulapodosenin/Desktop/prescriptions2018.csv"))
glimpse(prescriptions2018)
## Rows: 183,781
## Columns: 15
## $ utilization_type <chr> "MCOU", "FFSU", "FFSU", "MCOU", "MCOU",…
## $ state <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY…
## $ ndc <dbl> 2143301, 2143301, 2143380, 2143380, 214…
## $ labeler_code <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ product_code <int> 1433, 1433, 1433, 1433, 1434, 1434, 143…
## $ package_size <int> 1, 1, 80, 80, 1, 1, 80, 80, 11, 1, 9, 1…
## $ year <int> 2018, 2018, 2018, 2018, 2018, 2018, 201…
## $ quarter <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ suppression_used <chr> "false", "true", "false", "false", "fal…
## $ product_name <chr> "TRULICITY ", "TRULICITY ", "TRULICITY …
## $ units_reimbursed <dbl> 289.0, NA, 784.5, 14002.5, 164.5, 38.0,…
## $ number_of_prescriptions <dbl> 155, NA, 418, 6914, 83, 19, 8656, 414, …
## $ total_amount_reimbursed <dbl> 100761.38, NA, 278945.94, 4878626.90, 5…
## $ medicaid_amount_reimbursed <dbl> 100761.38, NA, 259171.38, 4877385.12, 5…
## $ non_medicaid_amount_reimbursed <dbl> 0.00, NA, 19774.56, 1241.78, 0.00, 0.00…
prescriptions2018$product_name <-tolower(prescriptions2018$product_name)
# SSRI/SNRIs 2018
prozac2018<-prescriptions2018[prescriptions2018$product_name== "fluoxetine", ]
zoloft2018<-prescriptions2018[prescriptions2018$product_name== "sertraline", ]
lexapro2018<-prescriptions2018[prescriptions2018$product_name== "citalopram", ]
wellbutrin2018<-prescriptions2018[prescriptions2018$product_name== "wellbutrin", ]
effexor2018<-prescriptions2023[prescriptions2018$product_name== "effexor xr", ]
# Sedatives/Hypnotics 2018
xanax2018<-prescriptions2018[prescriptions2018$product_name== "alprazolam", ]
klonopin2018<-prescriptions2018[prescriptions2018$product_name== "clonazepam", ]
# Creating a data frame with the medications I need for 2018
prescriptions_2018<- rbind(prozac2018, zoloft2018, lexapro2018, wellbutrin2018, effexor2018, xanax2018, klonopin2018)
glimpse(prescriptions_2018)
## Rows: 3,650
## Columns: 15
## $ utilization_type <chr> "MCOU", "FFSU", "MCOU", "FFSU", "MCOU",…
## $ state <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY…
## $ ndc <dbl> 93434656, 93610812, 93718810, 93718810,…
## $ labeler_code <int> 93, 93, 93, 93, 93, 93, 93, 93, 93, 93,…
## $ product_code <int> 4346, 6108, 7188, 7188, 7188, 7188, 719…
## $ package_size <int> 56, 12, 10, 10, 56, 56, 1, 1, 5, 56, 56…
## $ year <int> 2018, 2018, 2018, 2018, 2018, 2018, 201…
## $ quarter <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ suppression_used <chr> "true", "true", "false", "true", "false…
## $ product_name <chr> "fluoxetine", "fluoxetine", "fluoxetine…
## $ units_reimbursed <dbl> NA, NA, 667, NA, 1030, NA, 5292, 1290, …
## $ number_of_prescriptions <dbl> NA, NA, 21, NA, 31, NA, 164, 36, 183, 2…
## $ total_amount_reimbursed <dbl> NA, NA, 612.97, NA, 1132.75, NA, 1493.1…
## $ medicaid_amount_reimbursed <dbl> NA, NA, 612.97, NA, 1132.75, NA, 1493.1…
## $ non_medicaid_amount_reimbursed <dbl> NA, NA, 0.00, NA, 0.00, NA, 0.00, 5.73,…
# Reading the contents of the csv file using read.csv() function and getting a look at the data
prescriptions2013<-as.data.frame(read.csv("/Users/ursulapodosenin/Desktop/prescriptions2013.csv"))
glimpse(prescriptions2013)
## Rows: 143,226
## Columns: 15
## $ utilization_type <chr> "FFSU", "MCOU", "MCOU", "FFSU", "FFSU",…
## $ state <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY…
## $ ndc <dbl> 2197590, 2197590, 2300475, 2300475, 232…
## $ labeler_code <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ product_code <int> 1975, 1975, 3004, 3004, 3227, 3227, 322…
## $ package_size <int> 90, 90, 75, 75, 30, 30, 30, 30, 30, 30,…
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 201…
## $ quarter <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ suppression_used <chr> "false", "false", "true", "true", "fals…
## $ product_name <chr> "AXIRON ", "AXIRON ", "PROZAC WEE…
## $ units_reimbursed <dbl> 1260.00, 39740.00, NA, NA, 8670.00, 233…
## $ number_of_prescriptions <dbl> 14, 406, NA, NA, 208, 606, 1697, 549, 8…
## $ total_amount_reimbursed <dbl> 5049.96, 162268.17, NA, NA, 59433.88, 1…
## $ medicaid_amount_reimbursed <dbl> 3834.86, 162268.17, NA, NA, 38849.61, 1…
## $ non_medicaid_amount_reimbursed <dbl> 1215.10, 0.00, NA, NA, 20584.27, 0.00, …
prescriptions2013$product_name <-tolower(prescriptions2013$product_name)
# SSRI/SNRIs 2013
prozac2013<-prescriptions2013[prescriptions2013$product_name== "fluoxetine", ]
zoloft2013<-prescriptions2013[prescriptions2013$product_name== "sertraline", ]
lexapro2013<-prescriptions2013[prescriptions2013$product_name== "citalopram", ]
wellbutrin2013<-prescriptions2013[prescriptions2013$product_name== "wellbutrin", ]
effexor2013<-prescriptions2013[prescriptions2013$product_name== "effexor xr", ]
# Sedatives/Hypnotics 2013
xanax2013<-prescriptions2013[prescriptions2013$product_name== "alprazolam", ]
klonopin2013<-prescriptions2013[prescriptions2013$product_name== "clonazepam", ]
# Creating a data frame with the medications I need for 2013
prescriptions_2013<- rbind(prozac2013, zoloft2013, lexapro2013, wellbutrin2013, effexor2013, xanax2013, klonopin2013)
glimpse(prescriptions_2013)
## Rows: 3,675
## Columns: 15
## $ utilization_type <chr> "FFSU", "MCOU", "MCOU", "FFSU", "FFSU",…
## $ state <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY…
## $ ndc <dbl> 93434656, 93434656, 93435619, 93435619,…
## $ labeler_code <int> 93, 93, 93, 93, 93, 93, 93, 93, 93, 93,…
## $ product_code <int> 4346, 4346, 4356, 4356, 4356, 4356, 610…
## $ package_size <int> 56, 56, 19, 19, 93, 93, 12, 12, 10, 56,…
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 201…
## $ quarter <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ suppression_used <chr> "false", "false", "true", "true", "fals…
## $ product_name <chr> "fluoxetine", "fluoxetine", "fluoxetine…
## $ units_reimbursed <dbl> 10699.00, 99232.00, NA, NA, 480.00, 673…
## $ number_of_prescriptions <dbl> 307, 2954, NA, NA, 12, 177, 125, 79, NA…
## $ total_amount_reimbursed <dbl> 3320.79, 43195.07, NA, NA, 55.20, 563.0…
## $ medicaid_amount_reimbursed <dbl> 3082.04, 43195.07, NA, NA, 55.20, 563.0…
## $ non_medicaid_amount_reimbursed <dbl> 238.75, 0.00, NA, NA, 0.00, 0.00, 0.00,…
# Reading the contents of the csv file using read.csv() function and getting a look at the data
prescriptions2008<-as.data.frame(read.csv("/Users/ursulapodosenin/Desktop/prescriptions2008.csv"))
glimpse(prescriptions2008)
## Rows: 61,253
## Columns: 15
## $ utilization_type <chr> "FFSU", "FFSU", "FFSU", "FFSU", "FFSU",…
## $ state <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY…
## $ ndc <dbl> 2148501, 2300475, 2322730, 2322830, 232…
## $ labeler_code <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ product_code <int> 1485, 3004, 3227, 3228, 3229, 3230, 323…
## $ package_size <int> 1, 75, 30, 30, 30, 30, 30, 30, 30, 30, …
## $ year <int> 2008, 2008, 2008, 2008, 2008, 2008, 200…
## $ quarter <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ suppression_used <chr> "true", "false", "false", "false", "fal…
## $ product_name <chr> "CAPASTAT S", "PROZAC WEE", "STRATTERA …
## $ units_reimbursed <dbl> NA, 5143.00, 44858.00, 120994.11, 17439…
## $ number_of_prescriptions <dbl> NA, 992, 1050, 2949, 4545, 106, 390, 18…
## $ total_amount_reimbursed <dbl> NA, 137412.76, 193617.64, 522585.23, 81…
## $ medicaid_amount_reimbursed <dbl> NA, 135789.87, 180906.88, 479900.83, 76…
## $ non_medicaid_amount_reimbursed <dbl> NA, 1622.89, 12710.76, 42684.40, 45433.…
prescriptions2008$product_name <-tolower(prescriptions2008$product_name)
# SSRI/SNRIs 2008
prozac2008<-prescriptions2008[prescriptions2008$product_name== "fluoxetine", ]
zoloft2008<-prescriptions2008[prescriptions2008$product_name== "sertraline", ]
lexapro2008<-prescriptions2008[prescriptions2008$product_name== "citalopram", ]
wellbutrin2008<-prescriptions2008[prescriptions2008$product_name== "wellbutrin", ]
effexor2008<-prescriptions2008[prescriptions2008$product_name== "effexor xr", ]
# Sedatives/Hypnotics 2008
xanax2008<-prescriptions2008[prescriptions2008$product_name== "alprazolam", ]
klonopin2008<-prescriptions2008[prescriptions2008$product_name== "clonazepam", ]
# Creating a data frame with the medications I need for 2008
prescriptions_2008<- rbind(prozac2008, zoloft2008, lexapro2008, wellbutrin2008, effexor2008, xanax2008, klonopin2008)
glimpse(prescriptions_2008)
## Rows: 1,881
## Columns: 15
## $ utilization_type <chr> "FFSU", "FFSU", "FFSU", "FFSU", "FFSU",…
## $ state <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY…
## $ ndc <dbl> 93104201, 93104210, 93104301, 93435601,…
## $ labeler_code <int> 93, 93, 93, 93, 93, 93, 93, 93, 121, 17…
## $ product_code <int> 1042, 1042, 1043, 4356, 6108, 7188, 719…
## $ package_size <int> 1, 10, 1, 1, 12, 56, 1, 56, 4, 46, 60, …
## $ year <int> 2008, 2008, 2008, 2008, 2008, 2008, 200…
## $ quarter <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ suppression_used <chr> "false", "true", "false", "false", "fal…
## $ product_name <chr> "fluoxetine", "fluoxetine", "fluoxetine…
## $ units_reimbursed <dbl> 81960.0, NA, 181085.0, 25597.0, 42417.5…
## $ number_of_prescriptions <dbl> 2015, NA, 4146, 554, 262, 324, 63, 3463…
## $ total_amount_reimbursed <dbl> 19970.15, NA, 46725.58, 6491.12, 7535.6…
## $ medicaid_amount_reimbursed <dbl> 19832.97, NA, 46546.02, 6464.77, 7171.0…
## $ non_medicaid_amount_reimbursed <dbl> 137.18, NA, 179.56, 26.35, 364.68, 69.2…
# Reading the contents of the csv file using read.csv() function and getting a look at the data
prescriptions2003<-as.data.frame(read.csv("/Users/ursulapodosenin/Desktop/prescriptions2003.csv"))
glimpse(prescriptions2003)
## Rows: 63,587
## Columns: 15
## $ utilization_type <chr> "FFSU", "FFSU", "FFSU", "FFSU", "FFSU",…
## $ state <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY…
## $ ndc <dbl> 2035102, 2035302, 2035303, 2036302, 203…
## $ labeler_code <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ product_code <int> 351, 353, 353, 363, 363, 803, 817, 819,…
## $ package_size <int> 2, 2, 3, 2, 3, 2, 2, 2, 1, 25, 2, 5, 75…
## $ year <int> 2003, 2003, 2003, 2003, 2003, 2003, 200…
## $ quarter <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ suppression_used <chr> "false", "false", "true", "false", "fal…
## $ product_name <chr> "DARVOCET N", "DARVON-N ", "DARVON-N …
## $ units_reimbursed <dbl> 7892, 27795, NA, 26905, 2415, 3000, NA,…
## $ number_of_prescriptions <dbl> 132, 309, NA, 263, 28, 25, NA, 15, NA, …
## $ total_amount_reimbursed <dbl> 3402.34, 20719.07, NA, 23183.12, 2102.2…
## $ medicaid_amount_reimbursed <dbl> 0.00, 20719.07, NA, 0.00, 0.00, 1680.81…
## $ non_medicaid_amount_reimbursed <dbl> 0, 0, NA, 0, 0, 0, NA, 0, NA, NA, 0, NA…
prescriptions2003$product_name <-tolower(prescriptions2003$product_name)
# SSRI/SNRIs 2003
prozac2003<-prescriptions2003[prescriptions2003$product_name== "fluoxetine", ]
zoloft2003<-prescriptions2003[prescriptions2003$product_name== "sertraline", ]
lexapro2003<-prescriptions2003[prescriptions2003$product_name== "citalopram", ]
wellbutrin2003<-prescriptions2003[prescriptions2003$product_name== "wellbutrin", ]
effexor2003<-prescriptions2003[prescriptions2003$product_name== "effexor xr", ]
# Sedatives/Hypnotics 2003
xanax2003<-prescriptions2003[prescriptions2003$product_name== "alprazolam", ]
klonopin2003<-prescriptions2003[prescriptions2003$product_name== "clonazepam", ]
# Creating a data frame with the medications I need for 2003
prescriptions_2003<- rbind(prozac2003, zoloft2003, lexapro2003, wellbutrin2003, effexor2003, xanax2003, klonopin2003)
glimpse(prescriptions_2003)
## Rows: 842
## Columns: 15
## $ utilization_type <chr> "FFSU", "FFSU", "FFSU", "FFSU", "FFSU",…
## $ state <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY…
## $ ndc <dbl> 93104201, 93104301, 93610812, 93718810,…
## $ labeler_code <int> 93, 93, 93, 93, 93, 93, 121, 172, 172, …
## $ product_code <int> 1042, 1043, 6108, 7188, 7188, 7198, 721…
## $ package_size <int> 1, 1, 12, 10, 56, 56, 4, 60, 70, 80, 60…
## $ year <int> 2003, 2003, 2003, 2003, 2003, 2003, 200…
## $ quarter <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ suppression_used <chr> "false", "false", "false", "true", "fal…
## $ product_name <chr> "fluoxetine", "fluoxetine", "fluoxetine…
## $ units_reimbursed <dbl> 42836, 196256, 48455, NA, 5513, 99686, …
## $ number_of_prescriptions <dbl> 1026, 4281, 222, NA, 134, 2737, 33, 110…
## $ total_amount_reimbursed <dbl> 26816.01, 132589.80, 33290.71, NA, 3255…
## $ medicaid_amount_reimbursed <dbl> 0.00, 132589.80, 0.00, NA, 0.00, 0.00, …
## $ non_medicaid_amount_reimbursed <dbl> 0, 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# Reading the contents of the csv file using read.csv() function and getting a look at the data
prescriptions1998<-as.data.frame(read.csv("/Users/ursulapodosenin/Desktop/prescriptions1998.csv"))
glimpse(prescriptions1998)
## Rows: 92,938
## Columns: 15
## $ utilization_type <chr> "FFSU", "FFSU", "FFSU", "FFSU", "FFSU",…
## $ state <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY…
## $ ndc <dbl> 2035102, 2035302, 2035303, 2036302, 203…
## $ labeler_code <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ product_code <int> 351, 353, 353, 363, 363, 363, 640, 803,…
## $ package_size <int> 2, 2, 3, 2, 3, 46, 2, 2, 3, 2, 2, 2, 2,…
## $ year <int> 1998, 1998, 1998, 1998, 1998, 1998, 199…
## $ quarter <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ suppression_used <chr> "false", "false", "false", "false", "fa…
## $ product_name <chr> "DARVOCET N", "DARVON-N ", "DARVON-N …
## $ units_reimbursed <dbl> 24259, 21505, 1649, 204234, 46603, 1690…
## $ number_of_prescriptions <dbl> 457, 277, 16, 2662, 571, 18, 97, 119, N…
## $ total_amount_reimbursed <dbl> 8279.44, 11797.01, 848.99, 118588.31, 2…
## $ medicaid_amount_reimbursed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0,…
## $ non_medicaid_amount_reimbursed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0,…
prescriptions1998$product_name <-tolower(prescriptions1998$product_name)
# SSRI/SNRIs 1998
prozac1998<-prescriptions1998[prescriptions1998$product_name== "fluoxetine", ]
zoloft1998<-prescriptions1998[prescriptions1998$product_name== "sertraline", ]
lexapro1998<-prescriptions1998[prescriptions1998$product_name== "citalopram", ]
wellbutrin1998<-prescriptions1998[prescriptions1998$product_name== "wellbutrin", ]
effexor1998<-prescriptions1998[prescriptions1998$product_name== "effexor xr", ]
# Sedatives/Hypnotics 1998
xanax1998<-prescriptions1998[prescriptions1998$product_name== "alprazolam", ]
klonopin1998<-prescriptions1998[prescriptions1998$product_name== "clonazepam", ]
# Creating a data frame with the medications I need for 1998
prescriptions_1998<- rbind(prozac1998, zoloft1998, lexapro1998, wellbutrin1998, effexor1998, xanax1998, klonopin1998)
glimpse(prescriptions_1998)
## Rows: 654
## Columns: 15
## $ utilization_type <chr> "FFSU", "FFSU", "FFSU", "FFSU", "FFSU",…
## $ state <chr> "NY", "NY", "NY", "NY", "NY", "NY", "NY…
## $ ndc <dbl> 81017755, 81017855, 173013555, 17301775…
## $ labeler_code <int> 81, 81, 173, 173, 173, 173, 81, 81, 173…
## $ product_code <int> 177, 178, 135, 177, 178, 947, 177, 178,…
## $ package_size <int> 55, 55, 55, 55, 55, 55, 55, 55, 55, 55,…
## $ year <int> 1998, 1998, 1998, 1998, 1998, 1998, 199…
## $ quarter <int> 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 2, …
## $ suppression_used <chr> "false", "false", "false", "false", "fa…
## $ product_name <chr> "wellbutrin", "wellbutrin", "wellbutrin…
## $ units_reimbursed <dbl> 23024, 23968, 796913, 259421, 205561, 1…
## $ number_of_prescriptions <dbl> 370, 369, 14898, 3971, 3110, 2846, 445,…
## $ total_amount_reimbursed <dbl> 13962.47, 19177.14, 997179.60, 181304.5…
## $ medicaid_amount_reimbursed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ non_medicaid_amount_reimbursed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Combining prescription data from multiple years into a single data frame
prescriptions_total_years <- bind_rows(prescriptions_2023, prescriptions_2018, prescriptions_2013, prescriptions_2008, prescriptions_2003, prescriptions_1998)
# Creating a clean version of the combined data frame by removing rows with any missing values
prescriptions_total_years_clean <- prescriptions_total_years[complete.cases(prescriptions_total_years),]
# Grouping the cleaned data frame by product name and year, then calculating various summary statistics
prescriptions_total_years_clean|>
group_by(product_name)|>
summarise(total_prescriptions= sum(number_of_prescriptions),
average_prescriptions= mean(number_of_prescriptions),
min_prescriptions= min(number_of_prescriptions),
max_prescriptions= max(number_of_prescriptions),
range_prescriptions= range(number_of_prescriptions),
sd_prescriptions= sd((number_of_prescriptions)
))
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'product_name'. You can override using the
## `.groups` argument.
## # A tibble: 38 × 7
## # Groups: product_name [19]
## product_name total_prescriptions average_prescriptions min_prescriptions
## <chr> <dbl> <dbl> <dbl>
## 1 "albendazol" 296 296 296
## 2 "albendazol" 296 296 296
## 3 "alprazolam" 1399134 617. 11
## 4 "alprazolam" 1399134 617. 11
## 5 "caverject " 30 30 30
## 6 "caverject " 30 30 30
## 7 "citalopram" 864949 548. 11
## 8 "citalopram" 864949 548. 11
## 9 "clonazepam" 1733497 958. 11
## 10 "clonazepam" 1733497 958. 11
## # ℹ 28 more rows
## # ℹ 3 more variables: max_prescriptions <dbl>, range_prescriptions <dbl>,
## # sd_prescriptions <dbl>
# Selecting the columns needed for analysis
year_and_number_of_prescriptions_by_name <- prescriptions_total_years_clean[, c("year", "product_name", "number_of_prescriptions")]
glimpse(year_and_number_of_prescriptions_by_name)
## Rows: 9,626
## Columns: 3
## $ year <int> 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023…
## $ product_name <chr> "fluoxetine", "fluoxetine", "fluoxetine", "flu…
## $ number_of_prescriptions <dbl> 36, 2426, 405, 278, 380, 17, 20, 45, 34, 97, 6…
# Grouping the data by product name and year, calculating the total prescriptions, and arranging the data by product name
data_for_predictions <- year_and_number_of_prescriptions_by_name |>
group_by(product_name, year) |>
summarise(total_prescriptions = sum(number_of_prescriptions), .groups = 'drop') |>
arrange(product_name, desc(year))
glimpse(data_for_predictions)
## Rows: 48
## Columns: 3
## $ product_name <chr> "albendazol", "alprazolam", "alprazolam", "alprazo…
## $ year <int> 2023, 2023, 2018, 2013, 2008, 2003, 1998, 2023, 20…
## $ total_prescriptions <dbl> 296, 226100, 338904, 323498, 263901, 160400, 86331…
# Filtering the data frame for the relevant product names
selected_names <- c("alprazolam", "citalopram", "clonazepam", "effexor xr", "fluoxetine", "sertraline", "wellbutrin")
filtered_data <- data_for_predictions[data_for_predictions$product_name %in% selected_names, ]
# Fitting a linear regression model for each product
models <- filtered_data |>
group_by(product_name) |>
do(model = lm(total_prescriptions ~ year, data = .))
# Predicting the total prescriptions for 2030
predictions <- models |>
mutate(predicted_prescriptions_2030 = predict(model, newdata = data.frame(year = 2030)))
# Plotting the trend line and predictions
ggplot(filtered_data, aes(x = year, y = total_prescriptions, color = product_name)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, fullrange = TRUE) +
geom_text(data = predictions, aes(label = round(predicted_prescriptions_2030), x = 2030, y = predicted_prescriptions_2030), vjust = -0.5) +
labs(title = "Total Prescriptions Trend and Prediction for 2030",
x = "Year",
y = "Total Prescriptions") +
theme_minimal() +
scale_y_continuous(labels = function(x) format(x, scientific = FALSE))
## `geom_smooth()` using formula = 'y ~ x'
# Predicting the total prescriptions for 2030
predictions_2030 <- models |>
mutate(predicted_prescriptions_2030 = predict(model, newdata = data.frame(year = 2030)))
predictions_2030
## # A tibble: 7 × 3
## # Rowwise:
## product_name model predicted_prescriptions_2030
## <chr> <list> <dbl>
## 1 alprazolam <lm> 377372.
## 2 citalopram <lm> 201802.
## 3 clonazepam <lm> 441702.
## 4 effexor xr <lm> -15392.
## 5 fluoxetine <lm> 448136.
## 6 sertraline <lm> 825023.
## 7 wellbutrin <lm> -71754.
# Obtaining different sentiment analysis libraries
get_sentiments("afinn")
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ℹ 2,467 more rows
get_sentiments("nrc")
## # A tibble: 13,872 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ℹ 13,862 more rows
get_sentiments("bing")
## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ℹ 6,776 more rows
# Organizing the SSRI's/ SNRIs medications into one data frame
Fluoxetine <- drug_reviews[drug_reviews$drugName == "Prozac", ]
Sertraline <- drug_reviews[drug_reviews$drugName == "Zoloft", ]
Wellbutrin <- drug_reviews[drug_reviews$drugName == "Wellbutrin", ]
Citalopram <- drug_reviews[drug_reviews$drugName == "Celexa", ]
Effexor <- drug_reviews[drug_reviews$drugName == "Effexor", ]
ssrisnri<- rbind(Fluoxetine, Sertraline, Wellbutrin, Citalopram, Effexor)
glimpse(ssrisnri)
## Rows: 810
## Columns: 7
## $ ...1 <dbl> 181313, 181495, 181391, 181344, 181657, 181693, 181273, 18…
## $ drugName <chr> "Prozac", "Prozac", "Prozac", "Prozac", "Prozac", "Prozac"…
## $ condition <chr> "Major Depressive Disorde", "Anxiety and Stress", "Anxiety…
## $ review <chr> "\"I lost my sexuality from the first pill, overnight it w…
## $ rating <dbl> 1, 6, 6, 8, 10, 7, 2, 9, 9, 10, 10, 5, 9, 10, 10, 9, 10, 8…
## $ date <chr> "November 17, 2015", "August 8, 2014", "September 13, 2015…
## $ usefulCount <dbl> 24, 51, 38, 54, 193, 40, 35, 160, 115, 167, 50, 27, 102, 1…
# Renaming the prescriptions
ssrisnri$drugName<-ifelse(ssrisnri$drugName== "Prozac", "fluoxetine", ssrisnri$drugName)
ssrisnri$drugName<-ifelse(ssrisnri$drugName== "Zoloft", "sertraline", ssrisnri$drugName)
ssrisnri$drugName<-ifelse(ssrisnri$drugName== "Celexa", "citalopram", ssrisnri$drugName)
ssrisnri$drugName<-ifelse(ssrisnri$drugName== "Effexor", "effexor xr", ssrisnri$drugName)
ssrisnri$drugName<-ifelse(ssrisnri$drugName== "Wellbutrin", "wellburtin", ssrisnri$drugName)
# Performing the sentiment analysis
ssrisnri_sentiment <- ssrisnri |>
dplyr::select(drugName, review) |>
mutate(review = as.character(review)) |>
unnest_tokens(word, review) |>
inner_join(get_sentiments("bing")) |>
count(drugName, sentiment) |>
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) |>
mutate(sentiment_score = positive - negative)
## Joining with `by = join_by(word)`
# Viewing the resulting data frame and finding the average sentiment score
print(ssrisnri_sentiment)
## # A tibble: 5 × 4
## drugName negative positive sentiment_score
## <chr> <int> <int> <int>
## 1 citalopram 899 567 -332
## 2 effexor xr 470 300 -170
## 3 fluoxetine 897 599 -298
## 4 sertraline 1624 978 -646
## 5 wellburtin 571 396 -175
mean(ssrisnri_sentiment$sentiment_score)
## [1] -324.2
# Creating a bar plot of sentiment scores for each drug
ggplot(ssrisnri_sentiment, aes(x = drugName, y = sentiment_score, fill = sentiment_score > 0)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Sentiment Analysis of SSRI and SNRI Medications",
x = "Drug Name",
y = "Sentiment Score",
fill = "Sentiment") +
scale_fill_manual(values = c("red", "green"), labels = c("Negative", "Positive")) +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
### Based on the sentiment analysis for the SSRI/SNRI drug classes, it
appears that sertraline has the greatest negative sentiment overall.
# Organizing the Sedative/Hypnotic Medications into one data
Alprazolam <- drug_reviews[drug_reviews$drugName == "Xanax", ]
Clonazepam <- drug_reviews[drug_reviews$drugName == "Klonopin", ]
sedativehypnotic<-rbind(Alprazolam, Clonazepam)
sedativehypnotic$drugName <- ifelse(sedativehypnotic$drugName == "Xanax", "alprazolam", sedativehypnotic$drugName)
sedativehypnotic$drugName <- ifelse(sedativehypnotic$drugName == "Klonopin", "clonazepam", sedativehypnotic$drugName)
sedativehypnotic
## # A tibble: 270 × 7
## ...1 drugName condition review rating date usefulCount
## <dbl> <chr> <chr> <chr> <dbl> <chr> <dbl>
## 1 7953 alprazolam Anxiety "\"I am a 33 year ol… 9 Apri… 32
## 2 7744 alprazolam Depression "\"The first time I … 10 Nove… 193
## 3 8095 alprazolam Panic Disorde "\"Efficacy never in… 7 Apri… 120
## 4 7598 alprazolam Anxiety "\"I have had PTSD a… 10 Augu… 53
## 5 7897 alprazolam Anxiety "\"Xanax has been a … 10 Janu… 18
## 6 7745 alprazolam Anxiety "\"I take Xanax very… 9 Octo… 52
## 7 7610 alprazolam Depression "\"I suffer from soc… 10 May … 115
## 8 7670 alprazolam Panic Disorde "\"The only medicati… 10 Apri… 118
## 9 7992 alprazolam Anxiety "\"I have been on Xa… 10 June… 24
## 10 7589 alprazolam Anxiety "\"This drug literal… 10 Octo… 46
## # ℹ 260 more rows
# Performing the sentiment analysis
sedativehypnotic_sentiment <- sedativehypnotic |>
dplyr::select(drugName, review) |>
mutate(review = as.character(review)) |>
unnest_tokens(word, review) |>
inner_join(get_sentiments("bing")) |>
count(drugName, sentiment) |>
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) |>
mutate(sentiment_score = positive - negative)
## Joining with `by = join_by(word)`
# Viewing the resulting data frame and finding the average sentiment score
print(sedativehypnotic_sentiment)
## # A tibble: 2 × 4
## drugName negative positive sentiment_score
## <chr> <int> <int> <int>
## 1 alprazolam 736 443 -293
## 2 clonazepam 568 358 -210
mean(sedativehypnotic_sentiment$sentiment_score)
## [1] -251.5
# Creating a bar plot of sentiment scores for each drug
ggplot(sedativehypnotic_sentiment, aes(x = drugName, y = sentiment_score, fill = sentiment_score > 0)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Sentiment Analysis of Sedative and Hypnotic Medications",
x = "Drug Name",
y = "Sentiment Score",
fill = "Sentiment") +
scale_fill_manual(values = c("red", "green"), labels = c("Negative", "Positive")) +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
### Based on the sentiment analysis for the Sedative/Hypnotic drug
classes, it appears that alprazolam, also known as Xanax, has the
greatest negative sentiment overall.
# Making a combined table of all sentiment scores
sentiment_scores<-rbind(ssrisnri_sentiment, sedativehypnotic_sentiment)
# Viewing sentiment scores and prescription predictions for 2030
sentiment_scores
## # A tibble: 7 × 4
## drugName negative positive sentiment_score
## <chr> <int> <int> <int>
## 1 citalopram 899 567 -332
## 2 effexor xr 470 300 -170
## 3 fluoxetine 897 599 -298
## 4 sertraline 1624 978 -646
## 5 wellburtin 571 396 -175
## 6 alprazolam 736 443 -293
## 7 clonazepam 568 358 -210
predictions_2030
## # A tibble: 7 × 3
## # Rowwise:
## product_name model predicted_prescriptions_2030
## <chr> <list> <dbl>
## 1 alprazolam <lm> 377372.
## 2 citalopram <lm> 201802.
## 3 clonazepam <lm> 441702.
## 4 effexor xr <lm> -15392.
## 5 fluoxetine <lm> 448136.
## 6 sertraline <lm> 825023.
## 7 wellbutrin <lm> -71754.
names(predictions_2030)[1]<- "drugName"
merged_data <- merge(sentiment_scores, predictions_2030, by = "drugName", all.x = TRUE)
# Plotting the relationship between sentiment scores and predicted values for 2030 for each drug
ggplot(merged_data, aes(x = predicted_prescriptions_2030, y = sentiment_score, color = drugName)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, aes(group = drugName)) +
labs(title = "Relationship between Sentiment Scores and Predicted Values for 2030",
x = "Predicted Total Prescriptions for 2030",
y = "Sentiment Score",
color = "Drug Name") +
theme_classic() +
scale_x_continuous(labels = scales::comma)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).