An attempt to find the optimal number of calls to be made on an account in a month by a sales representative.

  1. Import data
  2. Scrub data
  3. Prepare data
  4. Explore data
  5. Analyze data
  6. Draw conclusions

Librairies used

#install.packages("date")
#install.packages("zipcode")
#install.packages("ggplot2"")
#install.packages("ggmap")

library(dplyr) #select arrange
## Warning: package 'dplyr' was built under R version 3.1.3
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(date)  #as.date
## Warning: package 'date' was built under R version 3.1.3
library(zipcode) #adds leading zipcodes
## Warning: package 'zipcode' was built under R version 3.1.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.3
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.1.3
library(RCurl)
## Warning: package 'RCurl' was built under R version 3.1.3
## Loading required package: bitops
#library(plyr)   #rename columns

Import Data stored on GitHub

mydata <- NULL
data <- getURL("https://raw.githubusercontent.com/srkhan123/FinalProject_SK/master/skd.csv",
               ssl.verifypeer=0L, followlocation=1L)
mydata <- read.csv(text=data)
str(mydata)
## 'data.frame':    22828 obs. of  12 variables:
##  $ TG_ID      : int  605880 605880 605880 605880 605880 605880 605880 605880 605880 605880 ...
##  $ ACCOUNT_NUM: Factor w/ 2847 levels "","001A000000cnTGWIA2",..: 101 101 101 101 101 101 101 101 101 101 ...
##  $ NAME       : Factor w/ 2741 levels "","A R H Regional Medical Center",..: 1473 1473 1473 1473 1473 1473 1473 1473 1473 1473 ...
##  $ ADDRESS    : Factor w/ 2798 levels "","1 1st Capitol Dr 300",..: 2782 2782 2782 2782 2782 2782 2782 2782 2782 2782 ...
##  $ CITY       : Factor w/ 1762 levels "","Abbeville",..: 1031 1031 1031 1031 1031 1031 1031 1031 1031 1031 ...
##  $ STATE      : Factor w/ 33 levels "","AL","AR","AZ",..: 26 26 26 26 26 26 26 26 26 26 ...
##  $ ZIP        : int  15063 15063 15063 15063 15063 15063 15063 15063 15063 15063 ...
##  $ REGION     : Factor w/ 3 levels "","East Region",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ TERR       : int  20108 20108 20108 20108 20108 20108 20108 20108 20108 20108 ...
##  $ DATE       : Factor w/ 37 levels "","1/31/2012",..: 17 26 32 35 8 14 9 22 25 28 ...
##  $ UNITS      : int  16 0 0 0 0 0 2 0 8 8 ...
##  $ CALLS      : int  0 4 2 2 2 2 2 2 0 2 ...
head(mydata)
##    TG_ID ACCOUNT_NUM                    NAME  ADDRESS        CITY STATE
## 1 605880    15063200 Monongahela Valley Hosp Rd Rt 88 Monongahela    PA
## 2 605880    15063200 Monongahela Valley Hosp Rd Rt 88 Monongahela    PA
## 3 605880    15063200 Monongahela Valley Hosp Rd Rt 88 Monongahela    PA
## 4 605880    15063200 Monongahela Valley Hosp Rd Rt 88 Monongahela    PA
## 5 605880    15063200 Monongahela Valley Hosp Rd Rt 88 Monongahela    PA
## 6 605880    15063200 Monongahela Valley Hosp Rd Rt 88 Monongahela    PA
##     ZIP      REGION  TERR       DATE UNITS CALLS
## 1 15063 East Region 20108  3/31/2012    16     0
## 2 15063 East Region 20108  6/30/2012     0     4
## 3 15063 East Region 20108  8/31/2012     0     2
## 4 15063 East Region 20108  9/30/2012     0     2
## 5 15063 East Region 20108 11/30/2012     0     2
## 6 15063 East Region 20108  2/28/2013     0     2

Tidy up Data

mydata$TG_ID <- factor(mydata$TG_ID) #TG_ID should by factor as an identifier
mydata$ZIPCODE <- as.factor(clean.zipcodes(mydata$ZIP)) #clean zipcodes by adding leading 0 and 5 digit long
mydata$ZIP <- NULL #delete old Zip column
head(mydata)
##    TG_ID ACCOUNT_NUM                    NAME  ADDRESS        CITY STATE
## 1 605880    15063200 Monongahela Valley Hosp Rd Rt 88 Monongahela    PA
## 2 605880    15063200 Monongahela Valley Hosp Rd Rt 88 Monongahela    PA
## 3 605880    15063200 Monongahela Valley Hosp Rd Rt 88 Monongahela    PA
## 4 605880    15063200 Monongahela Valley Hosp Rd Rt 88 Monongahela    PA
## 5 605880    15063200 Monongahela Valley Hosp Rd Rt 88 Monongahela    PA
## 6 605880    15063200 Monongahela Valley Hosp Rd Rt 88 Monongahela    PA
##        REGION  TERR       DATE UNITS CALLS ZIPCODE
## 1 East Region 20108  3/31/2012    16     0   15063
## 2 East Region 20108  6/30/2012     0     4   15063
## 3 East Region 20108  8/31/2012     0     2   15063
## 4 East Region 20108  9/30/2012     0     2   15063
## 5 East Region 20108 11/30/2012     0     2   15063
## 6 East Region 20108  2/28/2013     0     2   15063
str(mydata)
## 'data.frame':    22828 obs. of  12 variables:
##  $ TG_ID      : Factor w/ 2765 levels "605880","605881",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ ACCOUNT_NUM: Factor w/ 2847 levels "","001A000000cnTGWIA2",..: 101 101 101 101 101 101 101 101 101 101 ...
##  $ NAME       : Factor w/ 2741 levels "","A R H Regional Medical Center",..: 1473 1473 1473 1473 1473 1473 1473 1473 1473 1473 ...
##  $ ADDRESS    : Factor w/ 2798 levels "","1 1st Capitol Dr 300",..: 2782 2782 2782 2782 2782 2782 2782 2782 2782 2782 ...
##  $ CITY       : Factor w/ 1762 levels "","Abbeville",..: 1031 1031 1031 1031 1031 1031 1031 1031 1031 1031 ...
##  $ STATE      : Factor w/ 33 levels "","AL","AR","AZ",..: 26 26 26 26 26 26 26 26 26 26 ...
##  $ REGION     : Factor w/ 3 levels "","East Region",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ TERR       : int  20108 20108 20108 20108 20108 20108 20108 20108 20108 20108 ...
##  $ DATE       : Factor w/ 37 levels "","1/31/2012",..: 17 26 32 35 8 14 9 22 25 28 ...
##  $ UNITS      : int  16 0 0 0 0 0 2 0 8 8 ...
##  $ CALLS      : int  0 4 2 2 2 2 2 2 0 2 ...
##  $ ZIPCODE    : Factor w/ 2567 levels "15009","15025",..: 3 3 3 3 3 3 3 3 3 3 ...

Add map coordinates to zipcodes Longitude and Latitude for ggmap

data(zipcode)
mydata <- merge(mydata, zipcode, by.x = 'ZIPCODE', by.y = 'zip', all.x=TRUE)
head(mydata)
##   ZIPCODE  TG_ID ACCOUNT_NUM                   NAME             ADDRESS
## 1   15009 654194    15009200 Heritage Valley Beaver 1000 Dutch Ridge Rd
## 2   15009 654194    15009200 Heritage Valley Beaver 1000 Dutch Ridge Rd
## 3   15009 654194    15009200 Heritage Valley Beaver 1000 Dutch Ridge Rd
## 4   15009 654194    15009200 Heritage Valley Beaver 1000 Dutch Ridge Rd
## 5   15009 654194    15009200 Heritage Valley Beaver 1000 Dutch Ridge Rd
## 6   15009 654194    15009200 Heritage Valley Beaver 1000 Dutch Ridge Rd
##     CITY STATE      REGION  TERR       DATE UNITS CALLS   city state
## 1 Beaver    PA East Region 20108  6/30/2013     0     1 Beaver    PA
## 2 Beaver    PA East Region 20108  3/31/2012     0     1 Beaver    PA
## 3 Beaver    PA East Region 20108 10/31/2014     0     1 Beaver    PA
## 4 Beaver    PA East Region 20108  2/28/2013     0     2 Beaver    PA
## 5 Beaver    PA East Region 20108 12/31/2012     0     1 Beaver    PA
## 6 Beaver    PA East Region 20108  2/29/2012     0     1 Beaver    PA
##   latitude longitude
## 1 40.69528  -80.3299
## 2 40.69528  -80.3299
## 3 40.69528  -80.3299
## 4 40.69528  -80.3299
## 5 40.69528  -80.3299
## 6 40.69528  -80.3299

Histogram of Units

myunits <- mydata %>%
          filter(UNITS != 0)

#myunits[ which(myunits$UNITS == 0),]  

hist(myunits$UNITS, breaks = 50, main="Order Size Frequency", xlab="UNITS", col="lightgreen", xlim = c(-20,100))

Histogram of CAlls

mycalls <- mydata %>%
          filter(CALLS != 0)

#mycalls[ which(mycalls$CALLS == 0),]  

hist(mycalls$CALLS, breaks = 50, main="Number of Call Frequency", xlab="CALLS", col="lightblue", xlim = c(0,10))

Plot and exploring Correlation

plot(mydata$CALLS, mydata$UNITS , xlab="CALLS" , ylab="UNITS")

cor(myunits$CALLS, myunits$UNITS)
## [1] 0.1797944
cor(mycalls$CALLS, mycalls$UNITS)
## [1] 0.1330366

Quantile Plot

qqnorm(mydata$UNITS, main = "Normal Q-Q Plot for Units")
qqline(mydata$UNITS)

qqnorm(mydata$CALLS, main = "Normal Q-Q Plot for Calls")
qqline(mydata$CALLS)

Plotting data for zipcodes from the package using ggplot2

data(zipcode)
zipcode$region = substr(zipcode$zip, 1, 1)
 
g = ggplot(data = zipcode) + geom_point(aes(x=longitude, y=latitude, colour=region))
 
# simplify display and limit to the "lower 48"
g = g + theme_bw() + scale_x_continuous(limits = c(-125,-66), breaks = NULL)
g = g + scale_y_continuous(limits = c(25,50), breaks = NULL)
 
# don't need axis labels
g = g + labs(x=NULL, y=NULL)
g
## Warning: Removed 1333 rows containing missing values (geom_point).

Plotting using mydata and ggplot2

g <- NULL
g = ggplot(data = mydata) 
g = g + geom_point(aes(x=CALLS, y=UNITS))
#g = g + facet_grid(. ~ REGION)
#g = g + geom_smooth(method = "lm")
g = g + theme_gray()
g
## Warning: Removed 693 rows containing missing values (geom_point).

using ggmap with Units

map <- NULL
map <- get_map(location = 'united states', zoom = 4, maptype = "terrain", 
               source = 'google', color = 'color')
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=united+states&zoom=4&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
ggmap(map)+geom_point(aes (x=longitude, y=latitude, show_guide = TRUE,color = UNITS), 
                      data = myunits, alpha = 0.5, na.rm = T )

using ggmap with Calls

map <- NULL
map <- get_map(location = 'united states', zoom =4, maptype = "terrain", 
               source = 'google', color = 'color')
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=united+states&zoom=4&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
ggmap(map)+geom_point(aes (x=longitude, y=latitude, show_guide = TRUE,color = CALLS), 
                      data = mycalls, alpha = 0.5, na.rm = T )

Normalizing data to show units for each account over 3 years

str(mydata)
## 'data.frame':    22828 obs. of  16 variables:
##  $ ZIPCODE    : Factor w/ 2567 levels "15009","15025",..: 1 1 1 1 1 1 1 2 2 2 ...
##  $ TG_ID      : Factor w/ 2765 levels "605880","605881",..: 2667 2667 2667 2667 2667 2667 2667 2323 2323 2323 ...
##  $ ACCOUNT_NUM: Factor w/ 2847 levels "","001A000000cnTGWIA2",..: 99 99 99 99 99 99 99 100 100 100 ...
##  $ NAME       : Factor w/ 2741 levels "","A R H Regional Medical Center",..: 940 940 940 940 940 940 940 1056 1056 1056 ...
##  $ ADDRESS    : Factor w/ 2798 levels "","1 1st Capitol Dr 300",..: 91 91 91 91 91 91 91 2081 2081 2081 ...
##  $ CITY       : Factor w/ 1762 levels "","Abbeville",..: 107 107 107 107 107 107 107 283 283 283 ...
##  $ STATE      : Factor w/ 33 levels "","AL","AR","AZ",..: 26 26 26 26 26 26 26 26 26 26 ...
##  $ REGION     : Factor w/ 3 levels "","East Region",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ TERR       : int  20108 20108 20108 20108 20108 20108 20108 20108 20108 20108 ...
##  $ DATE       : Factor w/ 37 levels "","1/31/2012",..: 27 17 7 14 11 16 23 23 25 33 ...
##  $ UNITS      : int  0 0 0 0 0 0 0 6 2 0 ...
##  $ CALLS      : int  1 1 1 2 1 1 1 2 0 2 ...
##  $ city       : chr  "Beaver" "Beaver" "Beaver" "Beaver" ...
##  $ state      : chr  "PA" "PA" "PA" "PA" ...
##  $ latitude   : num  40.7 40.7 40.7 40.7 40.7 ...
##  $ longitude  : num  -80.3 -80.3 -80.3 -80.3 -80.3 ...
myaccounts <- NULL
myaccounts <- mydata %>%
              group_by(ACCOUNT_NUM, NAME, ADDRESS, CITY, STATE, ZIPCODE) %>% #
              #select(OUTLET_NUM, NAME, ADDRESS, CITY, STATE, ZIPCODE) %>%
              summarise( 
                TOTAL_UNITS = sum(UNITS), 
                TOTAL_CALLS = sum(CALLS)
              ) 


#check an account
myaccounts[ which(myaccounts$TOTAL_UNITS >0),]   
## Source: local data frame [2,552 x 8]
## Groups: ACCOUNT_NUM, NAME, ADDRESS, CITY, STATE
## 
##    ACCOUNT_NUM                              NAME                  ADDRESS
## 1           10         Madison Memorial Hospital         309 Ne Marion St
## 2           12  Maryvale Hospital Medical Center      5102 W Campbell Ave
## 3           13 Miami Dade County Em Fire Rescure          9300 Nw 41st St
## 4     15025200 Jefferson Regional Medical Center       565 Coal Valley Rd
## 5     15063200           Monongahela Valley Hosp                 Rd Rt 88
## 6     15065200         Allegheny Valley Hospital         1301 Carlisle St
## 7     15143201         Heritage Valley Sewickley         720 Blackburn Rd
## 8     15146201         Forbes Reg Hlth Ctr-E Sub         2570 Haymaker Rd
## 9     15203250             UPMC Pharmacy Central         3175 E Carson St
## 10    15212200        Allegheny General Hospital 1300 Sandusky St Ste 405
## ..         ...                               ...                      ...
## Variables not shown: CITY (fctr), STATE (fctr), ZIPCODE (fctr),
##   TOTAL_UNITS (int), TOTAL_CALLS (int)
myaccounts[ which(myaccounts$ACCOUNT_NUM == '15143201'),]   
## Source: local data frame [1 x 8]
## Groups: ACCOUNT_NUM, NAME, ADDRESS, CITY, STATE
## 
##   ACCOUNT_NUM                      NAME          ADDRESS      CITY STATE
## 1    15143201 Heritage Valley Sewickley 720 Blackburn Rd Sewickley    PA
## Variables not shown: ZIPCODE (fctr), TOTAL_UNITS (int), TOTAL_CALLS (int)
mydata[ which(mydata$ACCOUNT_NUM == '15143201'),]   
##    ZIPCODE  TG_ID ACCOUNT_NUM                      NAME          ADDRESS
## 41   15143 652644    15143201 Heritage Valley Sewickley 720 Blackburn Rd
## 42   15143 652644    15143201 Heritage Valley Sewickley 720 Blackburn Rd
## 43   15143 652644    15143201 Heritage Valley Sewickley 720 Blackburn Rd
##         CITY STATE      REGION  TERR       DATE UNITS CALLS      city
## 41 Sewickley    PA East Region 20108  5/31/2012     0     1 Sewickley
## 42 Sewickley    PA East Region 20108  4/30/2014     2     0 Sewickley
## 43 Sewickley    PA East Region 20108 10/31/2014     0     1 Sewickley
##    state latitude longitude
## 41    PA 40.56109 -80.15554
## 42    PA 40.56109 -80.15554
## 43    PA 40.56109 -80.15554
head (myaccounts)
## Source: local data frame [6 x 8]
## Groups: ACCOUNT_NUM, NAME, ADDRESS, CITY, STATE
## 
##          ACCOUNT_NUM                                    NAME
## 1                                                           
## 2 001A000000cnTGWIA2 Moses Cone Hospital Outpatient Pharmacy
## 3 001A000000cnTmtIAE                   Georgia Poison Center
## 4 001A000000cnToiIAE     Ft Leonard Wood Hospital Outpatient
## 5 001A000000cnTS0IAM      Sacred Heart Hospital Of Pensacola
## 6 001A000000cnTxqIAE             Bon Secours Mary Immaculate
## Variables not shown: ADDRESS (fctr), CITY (fctr), STATE (fctr), ZIPCODE
##   (fctr), TOTAL_UNITS (int), TOTAL_CALLS (int)
str(myaccounts)
## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame':  2850 obs. of  8 variables:
##  $ ACCOUNT_NUM: Factor w/ 2847 levels "","001A000000cnTGWIA2",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ NAME       : Factor w/ 2741 levels "","A R H Regional Medical Center",..: 1 1493 800 777 1972 216 1713 1720 2242 2438 ...
##  $ ADDRESS    : Factor w/ 2798 levels "","1 1st Capitol Dr 300",..: 1 329 2728 469 1996 2516 1673 2795 725 1729 ...
##  $ CITY       : Factor w/ 1762 levels "","Abbeville",..: 1 619 71 532 1221 992 1194 318 1180 1139 ...
##  $ STATE      : Factor w/ 33 levels "","AL","AR","AZ",..: 1 19 10 17 9 31 5 27 5 5 ...
##  $ ZIPCODE    : Factor w/ 2567 levels "15009","15025",..: NA 312 519 1318 652 192 2382 428 2354 2292 ...
##  $ TOTAL_UNITS: int  NA 0 0 0 0 0 0 0 0 0 ...
##  $ TOTAL_CALLS: int  NA 62 10 3 26 9 9 8 5 21 ...
##  - attr(*, "vars")=List of 5
##   ..$ : symbol ACCOUNT_NUM
##   ..$ : symbol NAME
##   ..$ : symbol ADDRESS
##   ..$ : symbol CITY
##   ..$ : symbol STATE
##  - attr(*, "drop")= logi TRUE
summary(myaccounts)
##              ACCOUNT_NUM                           NAME     
##  2.43E+17          :   4   Memorial Hospital         :  10  
##                    :   1   Va Medical Center         :   7  
##  001A000000cnTGWIA2:   1   Good Samaritan Hospital   :   4  
##  001A000000cnTmtIAE:   1   MULTIPLE DOCTORS          :   4  
##  001A000000cnToiIAE:   1   Citizens Medical Center   :   3  
##  001A000000cnTS0IAM:   1   Johnston Memorial Hospital:   3  
##  (Other)           :2841   (Other)                   :2819  
##                   ADDRESS               CITY          STATE     
##  100 Hospital Dr      :   5   HOUSTON     :  25   TX     : 380  
##  101 Hospital Dr      :   4   San Antonio :  17   CA     : 314  
##  1 Medical Center Blvd:   3   Jacksonville:  16   FL     : 199  
##  200 Hospital Dr      :   3   Columbia    :  15   GA     : 140  
##  Hwy                  :   3   Jackson     :  13   NC     : 127  
##  1 1st Capitol Dr 300 :   2   Los Angeles :  13   TN     : 120  
##  (Other)              :2830   (Other)     :2751   (Other):1570  
##     ZIPCODE      TOTAL_UNITS      TOTAL_CALLS     
##  77030  :   7   Min.   : -12.0   Min.   :  0.000  
##  72205  :   5   1st Qu.:   6.0   1st Qu.:  0.000  
##  76104  :   5   Median :  18.0   Median :  3.000  
##  77845  :   5   Mean   :  45.4   Mean   :  8.617  
##  78229  :   5   3rd Qu.:  46.0   3rd Qu.: 11.000  
##  (Other):2822   Max.   :2568.0   Max.   :130.000  
##  NA's   :   1   NA's   :1        NA's   :1

adding number of counts for units > 0 and Calls > 0

UnitsCount <- NULL
UnitsCount <- mydata %>% 
              filter(UNITS > 0) %>%
              count(ACCOUNT_NUM) 
colnames(UnitsCount)[colnames(UnitsCount)=="n"] <- "NumberOfMonthsOrdered"


myaccountUnits <- merge(myaccounts, UnitsCount, by.x = 'ACCOUNT_NUM', by.y = 'ACCOUNT_NUM' , all.x=TRUE)

        
#check an account
myaccountUnits[ which(myaccountUnits$ACCOUNT_NUM == '15143201'),]   
##     ACCOUNT_NUM                      NAME          ADDRESS      CITY STATE
## 103    15143201 Heritage Valley Sewickley 720 Blackburn Rd Sewickley    PA
##     ZIPCODE TOTAL_UNITS TOTAL_CALLS NumberOfMonthsOrdered
## 103   15143           2           2                     1
#----------------------------------------------------------------------------------------------------

CallsCount <- NULL
CallsCount <- mydata %>% 
              filter(CALLS > 0) %>%
              count(ACCOUNT_NUM) 

colnames(CallsCount)[colnames(CallsCount)=="n"] <- "NumberOfMonthsCalled"

 
#check an account
CallsCount[ which(CallsCount$ACCOUNT_NUM == '15143201'),]
## Source: local data frame [1 x 2]
## 
##   ACCOUNT_NUM NumberOfMonthsCalled
## 1    15143201                    2
myaccounts <- merge(myaccounts, CallsCount, by.x = 'ACCOUNT_NUM', by.y = 'ACCOUNT_NUM' , all.x=TRUE)
          
#check an account
myaccounts[ which(myaccounts$ACCOUNT_NUM == '15143201'),]
##     ACCOUNT_NUM                      NAME          ADDRESS      CITY STATE
## 103    15143201 Heritage Valley Sewickley 720 Blackburn Rd Sewickley    PA
##     ZIPCODE TOTAL_UNITS TOTAL_CALLS NumberOfMonthsCalled
## 103   15143           2           2                    2
#-------------------

myaccounts <- merge(myaccounts, myaccountUnits, by.x = 'ACCOUNT_NUM', by.y = 'ACCOUNT_NUM' , all.x=TRUE)


myaccounts[ which(myaccounts$ACCOUNT_NUM == '15143201'),]
##     ACCOUNT_NUM                    NAME.x        ADDRESS.x    CITY.x
## 103    15143201 Heritage Valley Sewickley 720 Blackburn Rd Sewickley
##     STATE.x ZIPCODE.x TOTAL_UNITS.x TOTAL_CALLS.x NumberOfMonthsCalled
## 103      PA     15143             2             2                    2
##                        NAME.y        ADDRESS.y    CITY.y STATE.y ZIPCODE.y
## 103 Heritage Valley Sewickley 720 Blackburn Rd Sewickley      PA     15143
##     TOTAL_UNITS.y TOTAL_CALLS.y NumberOfMonthsOrdered
## 103             2             2                     1
myaccounts$NAME.y <- NULL  
myaccounts$ADDRESS.y  <- NULL
myaccounts$CITY.y  <- NULL
myaccounts$STATE.y  <- NULL
myaccounts$ZIPCODE.y  <- NULL
myaccounts$TOTAL_UNITS.y  <- NULL
myaccounts$TOTAL_CALLS.y <- NULL


myaccounts[ which(myaccounts$ACCOUNT_NUM == '15143201'),]
##     ACCOUNT_NUM                    NAME.x        ADDRESS.x    CITY.x
## 103    15143201 Heritage Valley Sewickley 720 Blackburn Rd Sewickley
##     STATE.x ZIPCODE.x TOTAL_UNITS.x TOTAL_CALLS.x NumberOfMonthsCalled
## 103      PA     15143             2             2                    2
##     NumberOfMonthsOrdered
## 103                     1

Analyzing data further

#graph units and calls

plot(myaccounts$TOTAL_UNITS.x~myaccounts$TOTAL_CALLS.x, xlab = "CALLS", ylab = "UNITS")

myaccounts[ which(myaccounts$TOTAL_UNITS.x > 2000),]
##      ACCOUNT_NUM                                             NAME.x
## 2594    92354200 Loma Linda University Medical Center And Childrens
##                    ADDRESS.x     CITY.x STATE.x ZIPCODE.x TOTAL_UNITS.x
## 2594 11234 Anderson St Ste A Loma Linda      CA     92354          2568
##      TOTAL_CALLS.x NumberOfMonthsCalled NumberOfMonthsOrdered
## 2594            77                   27                    20
myaccounts[ which(myaccounts$NumberOfMonthsOrdered > 25),]
##      ACCOUNT_NUM                 NAME.x              ADDRESS.x
## 755     32207200 Baptist Medical Center 800 Prudential Dr Fl 4
## 2091    78229206    University Hospital        4502 Medical Dr
##            CITY.x STATE.x ZIPCODE.x TOTAL_UNITS.x TOTAL_CALLS.x
## 755  Jacksonville      FL     32207           584            23
## 2091  San Antonio      TX     78229          1474            48
##      NumberOfMonthsCalled NumberOfMonthsOrdered
## 755                     9                    27
## 2091                   15                    29
plot(myaccounts$NumberOfMonthsOrdered~myaccounts$NumberOfMonthsCalled, xlab = "NumberOfMonthsCalled in last 3 years", ylab = "NumberOfMonthsOrdered in last 3 years")

cor(myaccounts$NumberOfMonthsOrdered, myaccounts$NumberOfMonthsCalled)
## [1] NA

The aboveplot shows that number of optimum calls in 3 years is 20 to get the most optimum level of ordering

which is equal to which gives orders about 50% of the time

Analyzing data further with Quartiles

levels = myaccounts$TOTAL_UNITS.x     
quantile(levels, c(0, .25, .5, .75, 1), na.rm = TRUE) 
##   0%  25%  50%  75% 100% 
##  -12    6   18   46 2568
# 0%  25%  50%  75% 100% 
# -12    6   16   42 2568 

quantile(myaccounts$TOTAL_UNITS.x , probs = seq(0, 1, 0.25), na.rm = TRUE, names = TRUE)
##   0%  25%  50%  75% 100% 
##  -12    6   18   46 2568
myQuantiles <- NULL
myQuantiles <- within(myaccounts, quantile <- cut(myaccounts$TOTAL_UNITS.x, quantile(myaccounts$TOTAL_UNITS.x, probs=0.25, 0.5, 0.75), include.lowest=TRUE, labels=FALSE))

myQuantiles[ which(myQuantiles$NumberOfMonthsOrdered > 25),]
##      ACCOUNT_NUM                 NAME.x              ADDRESS.x
## 755     32207200 Baptist Medical Center 800 Prudential Dr Fl 4
## 2091    78229206    University Hospital        4502 Medical Dr
##            CITY.x STATE.x ZIPCODE.x TOTAL_UNITS.x TOTAL_CALLS.x
## 755  Jacksonville      FL     32207           584            23
## 2091  San Antonio      TX     78229          1474            48
##      NumberOfMonthsCalled NumberOfMonthsOrdered quantile
## 755                     9                    27        2
## 2091                   15                    29        4
plot(myQuantiles$quantile~myQuantiles$NumberOfMonthsOrdered, xlab = "NumberOfMonthsOrdered", ylab="Quantiles")

plot(myQuantiles$quantile~myQuantiles$NumberOfMonthsCalled, xlab = "NumberOfMonthsCalled", ylab = "Quantiles")

Quantile Plot

qqnorm(myQuantiles$NumberOfMonthsOrdered, main = "Normal Q-Q Plot for Units")
qqline(myQuantiles$NumberOfMonthsOrdered)

qqnorm(myQuantiles$NumberOfMonthsCalled, main = "Normal Q-Q Plot for Calls")
qqline(myQuantiles$NumberOfMonthsCalled)

Linear Regression

lmResult <- with(myaccounts, lm(NumberOfMonthsOrdered ~ NumberOfMonthsCalled))

summary(lmResult)
## 
## Call:
## lm(formula = NumberOfMonthsOrdered ~ NumberOfMonthsCalled)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.8604 -2.2601 -0.6365  1.3869 21.8456 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           3.09524    0.12827   24.13   <2e-16 ***
## NumberOfMonthsCalled  0.27061    0.01294   20.91   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.49 on 1697 degrees of freedom
##   (1163 observations deleted due to missingness)
## Multiple R-squared:  0.2048, Adjusted R-squared:  0.2044 
## F-statistic: 437.2 on 1 and 1697 DF,  p-value: < 2.2e-16
plot(myaccounts$NumberOfMonthsOrdered~myaccounts$NumberOfMonthsCalled, xlab = "NumberOfMonthsCalled in last 3 years", ylab = "NumberOfMonthsOrdered in last 3 years")
abline(lmResult, col="red")

plot(lmResult)

Conclusion

the Plot data shows with abline that you need to make about 2 calls to get an order on an account