Final Project

Abishek, Jin, Kausika and Namrata

12/12/2019

1 Load data and libraries

library(readxl)
library(tidyverse)
ab_challenge <- read_excel("~/Downloads/AmerisourceBergen Challenge Data Dictionary.xlsx")
dispense_usage_transactions <- read_excel("~/Downloads/Dispense Usage Transactions Top 50 GCN Seq Nbrs Week 201836.xlsx")

2 create a new table

mean_values <- dispense_usage_transactions%>%
  group_by(NDC_DESC) %>%
  summarise(mean = mean(COGS_PRC_MTRC,na.rm = TRUE))

Two ways to verify

any(is.na(mean_values))  
## [1] FALSE
summary(mean_values)
##    NDC_DESC              mean          
##  Length:74          Min.   :   0.1328  
##  Class :character   1st Qu.:   6.8276  
##  Mode  :character   Median :  26.1454  
##                     Mean   : 270.8807  
##                     3rd Qu.:  94.3222  
##                     Max.   :2446.6700

3 Join the table created in question 2 with the data loaded in question 1.

inner<- inner_join(dispense_usage_transactions,mean_values) 

summary(inner)
##     WK_NBR          WK_BEGIN_DTE        WK_END_DTE        NCPDP_TOKEN       
##  Length:89516       Length:89516       Length:89516       Length:89516      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  PHRMCY_NAME        BUY_GRP_NAM            NDC              NDC_DESC        
##  Length:89516       Length:89516       Length:89516       Length:89516      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   BRX_GRX_CD         NDC_QTY_MTRC      NDC_PKG_SZ      NDC_CASE_PK_QTY
##  Length:89516       Min.   :   0.5   Min.   :   0.50   Min.   :1.000  
##  Class :character   1st Qu.:  10.2   1st Qu.:   6.70   1st Qu.:1.000  
##  Mode  :character   Median :  30.0   Median :  30.00   Median :1.000  
##                     Mean   :  85.1   Mean   :  83.13   Mean   :1.723  
##                     3rd Qu.:  60.0   3rd Qu.:  60.00   3rd Qu.:1.000  
##                     Max.   :2000.0   Max.   :2000.00   Max.   :5.000  
##                                                                       
##    MFG_NAM           GCNSEQ_NBR        GCNSEQ_DESC       
##  Length:89516       Length:89516       Length:89516      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  GCNSEQ_RTE_OF_ADMIN_DESC GCNSEQ_DSG_FORM_CD GCNSEQ_STR_DESC   
##  Length:89516             Length:89516       Length:89516      
##  Class :character         Class :character   Class :character  
##  Mode  :character         Mode  :character   Mode  :character  
##                                                                
##                                                                
##                                                                
##                                                                
##  DSPN_QTY_NDC_1WK   DSPN_QTY_NDC_4WKS DSPN_QTY_NDC_13WKS DSPN_QTY_NDC_26WKS
##  Min.   :    0.00   Min.   :    0.0   Min.   :     0.0   Min.   :      0   
##  1st Qu.:    0.00   1st Qu.:    4.0   1st Qu.:    60.0   1st Qu.:    114   
##  Median :    0.00   Median :   60.0   Median :   210.0   Median :    420   
##  Mean   :   65.25   Mean   :  269.2   Mean   :   868.3   Mean   :   1738   
##  3rd Qu.:   45.00   3rd Qu.:  180.0   3rd Qu.:   630.0   3rd Qu.:   1260   
##  Max.   :25116.00   Max.   :54320.0   Max.   :174126.0   Max.   :1001680   
##                                                                            
##  DSPN_QTY_GCN_1WK  DSPN_QTY_GCN_4WKS DSPN_QTY_GCN_13WKS DSPN_QTY_GCN_26WKS 
##  Min.   :    0.0   Min.   :    0.0   Min.   :     0.5   Min.   :      1.0  
##  1st Qu.:    0.0   1st Qu.:   30.0   1st Qu.:   130.0   1st Qu.:    246.9  
##  Median :   30.0   Median :  120.0   Median :   420.0   Median :    810.0  
##  Mean   :  157.9   Mean   :  646.6   Mean   :  2087.4   Mean   :   4173.9  
##  3rd Qu.:  120.0   3rd Qu.:  445.0   3rd Qu.:  1380.0   3rd Qu.:   2715.0  
##  Max.   :49278.0   Max.   :77982.0   Max.   :176926.0   Max.   :1001680.0  
##                                                                            
##  PURCH_QTY_NDC_1WK_MTRC PURCH_QTY_NDC_4WKS_MTRC PURCH_QTY_NDC_13WKS_MTRC
##  Min.   :-1000.00       Min.   :-4000.0         Min.   : -3000.0        
##  1st Qu.:    0.00       1st Qu.:    0.0         1st Qu.:    27.0        
##  Median :    0.00       Median :   30.0         Median :   180.0        
##  Mean   :   64.86       Mean   :  250.8         Mean   :   818.3        
##  3rd Qu.:   30.00       3rd Qu.:  165.0         3rd Qu.:   545.5        
##  Max.   :18000.00       Max.   :55000.0         Max.   :184000.0        
##                                                                         
##  PURCH_QTY_NDC_26WKS_MTRC PURCH_QTY_GCN_1WK_MTRC PURCH_QTY_GCN_4WKS_MTRC
##  Min.   : -8490           Min.   :-1000.0        Min.   :-1500.0        
##  1st Qu.:    80           1st Qu.:    0.0        1st Qu.:   15.0        
##  Median :   360           Median :    6.0        Median :   90.0        
##  Mean   :  1661           Mean   :  166.8        Mean   :  621.6        
##  3rd Qu.:  1170           3rd Qu.:   90.0        3rd Qu.:  390.0        
##  Max.   :381024           Max.   :18000.0        Max.   :56000.0        
##                                                                         
##  PURCH_QTY_GCN_13WKS_MTRC PURCH_QTY_GCN_26WKS_MTRC COGS_PRC_MTRC     
##  Min.   :  -780           Min.   : -8490           Min.   :   0.010  
##  1st Qu.:    99           1st Qu.:   210           1st Qu.:   6.652  
##  Median :   360           Median :   720           Median :  13.375  
##  Mean   :  2016           Mean   :  4092           Mean   :  68.903  
##  3rd Qu.:  1260           3rd Qu.:  2520           3rd Qu.:  29.455  
##  Max.   :186000           Max.   :381024           Max.   :2630.655  
##                                                    NA's   :3746      
##  DSPN_COGS_NDC_1WK  DSPN_COGS_NDC_4WKS DSPN_COGS_NDC_13WKS DSPN_COGS_NDC_26WKS
##  Min.   :       0   Min.   :       0   Min.   :        0   Min.   :        0  
##  1st Qu.:       0   1st Qu.:      45   1st Qu.:      683   1st Qu.:     1278  
##  Median :       0   Median :     822   Median :     2888   Median :     5571  
##  Mean   :    1133   Mean   :    4091   Mean   :    12359   Mean   :    26836  
##  3rd Qu.:     535   3rd Qu.:    2423   3rd Qu.:     7541   3rd Qu.:    14438  
##  Max.   :29614494   Max.   :72299684   Max.   :216899050   Max.   :438814401  
##  NA's   :3746                                                                 
##  DSPN_COGS_GCN_1WK  DSPN_COGS_GCN_4WKS DSPN_COGS_GCN_13WKS DSPN_COGS_GCN_26WKS
##  Min.   :       0   Min.   :       0   Min.   :        1   Min.   :        1  
##  1st Qu.:       0   1st Qu.:     478   1st Qu.:     1956   1st Qu.:     3692  
##  Median :     240   Median :    1549   Median :     4906   Median :     9571  
##  Mean   :    1282   Mean   :    4893   Mean   :    15029   Mean   :    34359  
##  3rd Qu.:     820   3rd Qu.:    3299   3rd Qu.:    10425   3rd Qu.:    20131  
##  Max.   :29614494   Max.   :72299684   Max.   :216899050   Max.   :438814401  
##                                                                               
##       mean          
##  Min.   :   0.1328  
##  1st Qu.:   6.7022  
##  Median :  13.1417  
##  Mean   :  66.9101  
##  3rd Qu.:  29.0646  
##  Max.   :2446.6700  
## 

4 From the table created in question 3,replace NA’s in COGS_PRC_MTRC with the mean of COGS_PRC_MTRC.

replace_na <- inner %>%
  mutate(COGS_PRC_MTRC = ifelse(is.na(COGS_PRC_MTRC) == T , mean , COGS_PRC_MTRC))

To verify

any(is.na(replace_na$COGS_PRC_MTRC)) 
## [1] FALSE

5 remove all records where there are negative values in any of the following fields

no_negative <- replace_na %>%
  filter(PURCH_QTY_NDC_1WK_MTRC >= 0, PURCH_QTY_NDC_4WKS_MTRC>=0, PURCH_QTY_NDC_13WKS_MTRC>=0,
         PURCH_QTY_NDC_26WKS_MTRC>=0)

summary(no_negative)
##     WK_NBR          WK_BEGIN_DTE        WK_END_DTE        NCPDP_TOKEN       
##  Length:88471       Length:88471       Length:88471       Length:88471      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  PHRMCY_NAME        BUY_GRP_NAM            NDC              NDC_DESC        
##  Length:88471       Length:88471       Length:88471       Length:88471      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   BRX_GRX_CD         NDC_QTY_MTRC       NDC_PKG_SZ      NDC_CASE_PK_QTY
##  Length:88471       Min.   :   0.50   Min.   :   0.50   Min.   :1.000  
##  Class :character   1st Qu.:  10.20   1st Qu.:   6.70   1st Qu.:1.000  
##  Mode  :character   Median :  30.00   Median :  30.00   Median :1.000  
##                     Mean   :  85.63   Mean   :  83.66   Mean   :1.727  
##                     3rd Qu.:  60.00   3rd Qu.:  60.00   3rd Qu.:1.000  
##                     Max.   :2000.00   Max.   :2000.00   Max.   :5.000  
##                                                                        
##    MFG_NAM           GCNSEQ_NBR        GCNSEQ_DESC       
##  Length:88471       Length:88471       Length:88471      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  GCNSEQ_RTE_OF_ADMIN_DESC GCNSEQ_DSG_FORM_CD GCNSEQ_STR_DESC   
##  Length:88471             Length:88471       Length:88471      
##  Class :character         Class :character   Class :character  
##  Mode  :character         Mode  :character   Mode  :character  
##                                                                
##                                                                
##                                                                
##                                                                
##  DSPN_QTY_NDC_1WK   DSPN_QTY_NDC_4WKS DSPN_QTY_NDC_13WKS DSPN_QTY_NDC_26WKS
##  Min.   :    0.00   Min.   :    0.0   Min.   :     0.0   Min.   :      0   
##  1st Qu.:    0.00   1st Qu.:    6.0   1st Qu.:    60.0   1st Qu.:    120   
##  Median :    0.00   Median :   60.0   Median :   210.0   Median :    420   
##  Mean   :   65.89   Mean   :  271.8   Mean   :   876.3   Mean   :   1753   
##  3rd Qu.:   50.00   3rd Qu.:  180.0   3rd Qu.:   630.0   3rd Qu.:   1272   
##  Max.   :25116.00   Max.   :54320.0   Max.   :174126.0   Max.   :1001680   
##                                                                            
##  DSPN_QTY_GCN_1WK  DSPN_QTY_GCN_4WKS DSPN_QTY_GCN_13WKS DSPN_QTY_GCN_26WKS
##  Min.   :    0.0   Min.   :    0.0   Min.   :     0.5   Min.   :      1   
##  1st Qu.:    0.0   1st Qu.:   30.0   1st Qu.:   130.0   1st Qu.:    250   
##  Median :   30.0   Median :  120.0   Median :   420.0   Median :    812   
##  Mean   :  158.7   Mean   :  649.7   Mean   :  2096.9   Mean   :   4192   
##  3rd Qu.:  120.0   3rd Qu.:  448.2   3rd Qu.:  1390.0   3rd Qu.:   2730   
##  Max.   :49278.0   Max.   :77982.0   Max.   :176926.0   Max.   :1001680   
##                                                                           
##  PURCH_QTY_NDC_1WK_MTRC PURCH_QTY_NDC_4WKS_MTRC PURCH_QTY_NDC_13WKS_MTRC
##  Min.   :    0.00       Min.   :    0.0         Min.   :     0.0        
##  1st Qu.:    0.00       1st Qu.:    0.0         1st Qu.:    30.0        
##  Median :    0.00       Median :   30.6         Median :   180.0        
##  Mean   :   65.75       Mean   :  253.8         Mean   :   826.9        
##  3rd Qu.:   30.00       3rd Qu.:  170.0         3rd Qu.:   570.0        
##  Max.   :18000.00       Max.   :55000.0         Max.   :184000.0        
##                                                                         
##  PURCH_QTY_NDC_26WKS_MTRC PURCH_QTY_GCN_1WK_MTRC PURCH_QTY_GCN_4WKS_MTRC
##  Min.   :     0.0         Min.   :-1000.0        Min.   :-1500.0        
##  1st Qu.:    90.0         1st Qu.:    0.0        1st Qu.:   18.0        
##  Median :   375.2         Median :    6.0        Median :   91.8        
##  Mean   :  1677.2         Mean   :  167.8        Mean   :  625.3        
##  3rd Qu.:  1200.0         3rd Qu.:   90.0        3rd Qu.:  390.0        
##  Max.   :381024.0         Max.   :18000.0        Max.   :56000.0        
##                                                                         
##  PURCH_QTY_GCN_13WKS_MTRC PURCH_QTY_GCN_26WKS_MTRC COGS_PRC_MTRC      
##  Min.   :  -210           Min.   :  -270           Min.   :   0.0097  
##  1st Qu.:   100           1st Qu.:   210           1st Qu.:   6.6270  
##  Median :   360           Median :   735           Median :  13.3077  
##  Mean   :  2026           Mean   :  4112           Mean   :  66.9086  
##  3rd Qu.:  1260           3rd Qu.:  2532           3rd Qu.:  28.7873  
##  Max.   :186000           Max.   :381024           Max.   :2630.6550  
##                                                                       
##  DSPN_COGS_NDC_1WK  DSPN_COGS_NDC_4WKS DSPN_COGS_NDC_13WKS DSPN_COGS_NDC_26WKS
##  Min.   :       0   Min.   :       0   Min.   :        0   Min.   :        0  
##  1st Qu.:       0   1st Qu.:      54   1st Qu.:      719   1st Qu.:     1323  
##  Median :       0   Median :     831   Median :     2937   Median :     5643  
##  Mean   :    1139   Mean   :    4130   Mean   :    12473   Mean   :    27090  
##  3rd Qu.:     538   3rd Qu.:    2438   3rd Qu.:     7609   3rd Qu.:    14537  
##  Max.   :29614494   Max.   :72299684   Max.   :216899050   Max.   :438814401  
##  NA's   :3368                                                                 
##  DSPN_COGS_GCN_1WK  DSPN_COGS_GCN_4WKS DSPN_COGS_GCN_13WKS DSPN_COGS_GCN_26WKS
##  Min.   :       0   Min.   :       0   Min.   :        1   Min.   :        1  
##  1st Qu.:       0   1st Qu.:     481   1st Qu.:     1958   1st Qu.:     3696  
##  Median :     242   Median :    1554   Median :     4914   Median :     9602  
##  Mean   :    1290   Mean   :    4921   Mean   :    15113   Mean   :    34582  
##  3rd Qu.:     821   3rd Qu.:    3316   3rd Qu.:    10455   3rd Qu.:    20176  
##  Max.   :29614494   Max.   :72299684   Max.   :216899050   Max.   :438814401  
##                                                                               
##       mean          
##  Min.   :   0.1328  
##  1st Qu.:   6.7022  
##  Median :  13.1417  
##  Mean   :  66.9076  
##  3rd Qu.:  29.0646  
##  Max.   :2446.6700  
## 

6 Create four leakage variables, one for each time interval. Leakage = dispense - purchase quantities

leakage <- no_negative%>%
  
  mutate(Leakage_1WK = DSPN_QTY_NDC_1WK - PURCH_QTY_NDC_1WK_MTRC,
         Leakage_4WKS = DSPN_QTY_NDC_4WKS - PURCH_QTY_NDC_4WKS_MTRC,
         Leakage_13WKS = DSPN_QTY_NDC_13WKS - PURCH_QTY_NDC_13WKS_MTRC,
         Leakage_26WKS = DSPN_QTY_NDC_26WKS - PURCH_QTY_NDC_26WKS_MTRC)

To have a look at our new variables

leakage[,45:48] 
## # A tibble: 88,471 x 4
##    Leakage_1WK Leakage_4WKS Leakage_13WKS Leakage_26WKS
##          <dbl>        <dbl>         <dbl>         <dbl>
##  1         120        270           900           900  
##  2         240        540          1260          1260  
##  3         105        360          1040          1040  
##  4          40         60           170           170  
##  5           0        381          2326          2326  
##  6           0          0           204           204  
##  7          60        240           420           420  
##  8           6          7.5          16.5          16.5
##  9         180        510          1200          1200  
## 10          60         60           180           180  
## # … with 88,461 more rows

7 Standardize to a per week metric the four leakage variables, note that the 1 week leakage does not need to be standardized.

std_lkg <-leakage%>%
  mutate(Std_Leakage_NDC_1WK=Leakage_1WK,
         Std_Leakage_NDC_4WKS=Leakage_4WKS/4,
         Std_Leakage_NDC_13WKS=Leakage_13WKS/13,
         Std_Leakage_NDC_26WKS=Leakage_26WKS/26 )
std_lkg[,49:52]
## # A tibble: 88,471 x 4
##    Std_Leakage_NDC_1… Std_Leakage_NDC_4W… Std_Leakage_NDC_13… Std_Leakage_NDC_2…
##                 <dbl>               <dbl>               <dbl>              <dbl>
##  1                120               67.5                69.2              34.6  
##  2                240              135                  96.9              48.5  
##  3                105               90                  80                40    
##  4                 40               15                  13.1               6.54 
##  5                  0               95.2               179.               89.5  
##  6                  0                0                  15.7               7.85 
##  7                 60               60                  32.3              16.2  
##  8                  6                1.88                1.27              0.635
##  9                180              128.                 92.3              46.2  
## 10                 60               15                  13.8               6.92 
## # … with 88,461 more rows

8 Calculate the mean per week leakage for each time interval.

mean_lkge <-std_lkg%>%
  mutate(mean_leakage_1WK=mean(Leakage_1WK),
         mean_leakage_4WKS=mean(Leakage_4WKS),
         mean_leakage_13WKS=mean(Leakage_13WKS),
         mean_leakage_26WKS=mean(Leakage_26WKS))

mean_lkge[,53:56]
## # A tibble: 88,471 x 4
##    mean_leakage_1WK mean_leakage_4WKS mean_leakage_13WKS mean_leakage_26WKS
##               <dbl>             <dbl>              <dbl>              <dbl>
##  1            0.140              17.9               49.4               75.8
##  2            0.140              17.9               49.4               75.8
##  3            0.140              17.9               49.4               75.8
##  4            0.140              17.9               49.4               75.8
##  5            0.140              17.9               49.4               75.8
##  6            0.140              17.9               49.4               75.8
##  7            0.140              17.9               49.4               75.8
##  8            0.140              17.9               49.4               75.8
##  9            0.140              17.9               49.4               75.8
## 10            0.140              17.9               49.4               75.8
## # … with 88,461 more rows

9 Does the leakage by time period differ significantly - Use of t.test

x1<-c(mean(mean_lkge$Leakage_1WK),mean(mean_lkge$Leakage_4WKS),mean(mean_lkge$Leakage_13WKS),mean(mean_lkge$Leakage_26WKS))
y1<-c(std_lkg$Std_Leakage_NDC_1WK,std_lkg$Std_Leakage_NDC_4WKS,std_lkg$Std_Leakage_NDC_13WKS,std_lkg$Std_Leakage_NDC_26WKS)
t.test(x1,y1)
## 
##  Welch Two Sample t-test
## 
## data:  x1 and y1
## t = 1.9665, df = 3.0018, p-value = 0.1439
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -20.37098  86.32397
## sample estimates:
## mean of x mean of y 
## 35.811278  2.834783

10. Using 13wk leakage variable, identify the pharmacies most relevant to contain leakages, logic in identifying pharmacies is more important.

ggplot(mean_lkge) +
geom_histogram(aes(x =Leakage_13WKS),bins = 3)