Final Project

Abishek, Jin, Kausika and Namrata

12/12/2019

Load data and libraries

library(readxl)
library(tidyverse)
ab_challenge <- read_excel("~/Downloads/AmerisourceBergen Challenge Data Dictionary.xlsx")
dispense_usage_transactions <- read_excel("~/Downloads/Dispense Usage Transactions Top 50 GCN Seq Nbrs Week 201836.xlsx")

2 create a new table

mean_values <- dispense_usage_transactions%>%
  group_by(NDC_DESC) %>%
  summarise(mean = mean(COGS_PRC_MTRC,na.rm = TRUE))

Two ways to verify

any(is.na(mean_values))  
## [1] FALSE
summary(mean_values)
##    NDC_DESC              mean          
##  Length:74          Min.   :   0.1328  
##  Class :character   1st Qu.:   6.8276  
##  Mode  :character   Median :  26.1454  
##                     Mean   : 270.8807  
##                     3rd Qu.:  94.3222  
##                     Max.   :2446.6700

3 Join the table created in question 2 with the data loaded in question 1.

inner<- inner_join(dispense_usage_transactions,mean_values) 

4 From the table created in question 3,replace NA’s in COGS_PRC_MTRC with the mean of COGS_PRC_MTRC.

replace_na <- inner %>%
  mutate(COGS_PRC_MTRC = ifelse(is.na(COGS_PRC_MTRC) == T , mean , COGS_PRC_MTRC))

To verify

any(is.na(replace_na$COGS_PRC_MTRC)) 
## [1] FALSE

5 remove all records where there are negative values in any of the following fields

no_negative <- replace_na %>%
  filter(PURCH_QTY_NDC_1WK_MTRC >= 0, PURCH_QTY_NDC_4WKS_MTRC>=0, PURCH_QTY_NDC_13WKS_MTRC>=0,
         PURCH_QTY_NDC_26WKS_MTRC>=0)

6 Create four leakage variables, one for each time interval. Leakage = dispense - purchase quantities

leakage <- no_negative%>%
  
  mutate(Leakage_1WK = DSPN_QTY_NDC_1WK - PURCH_QTY_NDC_1WK_MTRC,
         Leakage_4WKS = DSPN_QTY_NDC_4WKS - PURCH_QTY_NDC_4WKS_MTRC,
         Leakage_13WKS = DSPN_QTY_NDC_13WKS - PURCH_QTY_NDC_13WKS_MTRC,
         Leakage_26WKS = DSPN_QTY_NDC_26WKS - PURCH_QTY_NDC_26WKS_MTRC)

To have a look at our new variables

leakage[,45:48] 
## # A tibble: 88,471 x 4
##    Leakage_1WK Leakage_4WKS Leakage_13WKS Leakage_26WKS
##          <dbl>        <dbl>         <dbl>         <dbl>
##  1         120        270           900           900  
##  2         240        540          1260          1260  
##  3         105        360          1040          1040  
##  4          40         60           170           170  
##  5           0        381          2326          2326  
##  6           0          0           204           204  
##  7          60        240           420           420  
##  8           6          7.5          16.5          16.5
##  9         180        510          1200          1200  
## 10          60         60           180           180  
## # … with 88,461 more rows

7 Standardize to a per week metric the four leakage variables, note that the 1 week leakage does not need to be standardized.

std_lkg <-leakage%>%
  mutate(Std_Leakage_NDC_1WK=Leakage_1WK,
         Std_Leakage_NDC_4WKS=Leakage_4WKS/4,
         Std_Leakage_NDC_13WKS=Leakage_13WKS/13,
         Std_Leakage_NDC_26WKS=Leakage_26WKS/26 )
std_lkg[,49:52]
## # A tibble: 88,471 x 4
##    Std_Leakage_NDC_1… Std_Leakage_NDC_4W… Std_Leakage_NDC_13… Std_Leakage_NDC_2…
##                 <dbl>               <dbl>               <dbl>              <dbl>
##  1                120               67.5                69.2              34.6  
##  2                240              135                  96.9              48.5  
##  3                105               90                  80                40    
##  4                 40               15                  13.1               6.54 
##  5                  0               95.2               179.               89.5  
##  6                  0                0                  15.7               7.85 
##  7                 60               60                  32.3              16.2  
##  8                  6                1.88                1.27              0.635
##  9                180              128.                 92.3              46.2  
## 10                 60               15                  13.8               6.92 
## # … with 88,461 more rows

8 Calculate the mean per week leakage for each time interval.

mean_lkge <-std_lkg%>%
  mutate(mean_leakage_1WK=mean(Leakage_1WK),
         mean_leakage_4WKS=mean(Leakage_4WKS),
         mean_leakage_13WKS=mean(Leakage_13WKS),
         mean_leakage_26WKS=mean(Leakage_26WKS))

mean_lkge[,53:56]
## # A tibble: 88,471 x 4
##    mean_leakage_1WK mean_leakage_4WKS mean_leakage_13WKS mean_leakage_26WKS
##               <dbl>             <dbl>              <dbl>              <dbl>
##  1            0.140              17.9               49.4               75.8
##  2            0.140              17.9               49.4               75.8
##  3            0.140              17.9               49.4               75.8
##  4            0.140              17.9               49.4               75.8
##  5            0.140              17.9               49.4               75.8
##  6            0.140              17.9               49.4               75.8
##  7            0.140              17.9               49.4               75.8
##  8            0.140              17.9               49.4               75.8
##  9            0.140              17.9               49.4               75.8
## 10            0.140              17.9               49.4               75.8
## # … with 88,461 more rows

9 Does the leakage by time period differ significantly - Use of t.test

x1<-c(mean(mean_lkge$Leakage_1WK),mean(mean_lkge$Leakage_4WKS),mean(mean_lkge$Leakage_13WKS),mean(mean_lkge$Leakage_26WKS))
y1<-c(std_lkg$Std_Leakage_NDC_1WK,std_lkg$Std_Leakage_NDC_4WKS,std_lkg$Std_Leakage_NDC_13WKS,std_lkg$Std_Leakage_NDC_26WKS)
t.test(x1,y1)
## 
##  Welch Two Sample t-test
## 
## data:  x1 and y1
## t = 1.9665, df = 3.0018, p-value = 0.1439
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -20.37098  86.32397
## sample estimates:
## mean of x mean of y 
## 35.811278  2.834783

10. Using 13wk leakage variable, identify the pharmacies most relevant to contain leakages, logic in identifying pharmacies is more important.

ggplot(mean_lkge,aes(x = PHRMCY_NAME, y=Leakage_13WKS))+geom_point()