An attempt to find the optimal number of calls to be made on an account in a month by a sales representative.
#install.packages("date")
#install.packages("zipcode")
#install.packages("ggplot2"")
#install.packages("ggmap")
library(dplyr) #select arrange
## Warning: package 'dplyr' was built under R version 3.1.3
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(date) #as.date
## Warning: package 'date' was built under R version 3.1.3
library(zipcode) #adds leading zipcodes
## Warning: package 'zipcode' was built under R version 3.1.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.3
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.1.3
library(RCurl)
## Warning: package 'RCurl' was built under R version 3.1.3
## Loading required package: bitops
#library(plyr) #rename columns
mydata <- NULL
data <- getURL("https://raw.githubusercontent.com/srkhan123/FinalProject_SK/master/skd.csv",
ssl.verifypeer=0L, followlocation=1L)
mydata <- read.csv(text=data)
str(mydata)
## 'data.frame': 22828 obs. of 12 variables:
## $ TG_ID : int 605880 605880 605880 605880 605880 605880 605880 605880 605880 605880 ...
## $ ACCOUNT_NUM: Factor w/ 2847 levels "","001A000000cnTGWIA2",..: 101 101 101 101 101 101 101 101 101 101 ...
## $ NAME : Factor w/ 2741 levels "","A R H Regional Medical Center",..: 1473 1473 1473 1473 1473 1473 1473 1473 1473 1473 ...
## $ ADDRESS : Factor w/ 2798 levels "","1 1st Capitol Dr 300",..: 2782 2782 2782 2782 2782 2782 2782 2782 2782 2782 ...
## $ CITY : Factor w/ 1762 levels "","Abbeville",..: 1031 1031 1031 1031 1031 1031 1031 1031 1031 1031 ...
## $ STATE : Factor w/ 33 levels "","AL","AR","AZ",..: 26 26 26 26 26 26 26 26 26 26 ...
## $ ZIP : int 15063 15063 15063 15063 15063 15063 15063 15063 15063 15063 ...
## $ REGION : Factor w/ 3 levels "","East Region",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ TERR : int 20108 20108 20108 20108 20108 20108 20108 20108 20108 20108 ...
## $ DATE : Factor w/ 37 levels "","1/31/2012",..: 17 26 32 35 8 14 9 22 25 28 ...
## $ UNITS : int 16 0 0 0 0 0 2 0 8 8 ...
## $ CALLS : int 0 4 2 2 2 2 2 2 0 2 ...
head(mydata)
## TG_ID ACCOUNT_NUM NAME ADDRESS CITY STATE
## 1 605880 15063200 Monongahela Valley Hosp Rd Rt 88 Monongahela PA
## 2 605880 15063200 Monongahela Valley Hosp Rd Rt 88 Monongahela PA
## 3 605880 15063200 Monongahela Valley Hosp Rd Rt 88 Monongahela PA
## 4 605880 15063200 Monongahela Valley Hosp Rd Rt 88 Monongahela PA
## 5 605880 15063200 Monongahela Valley Hosp Rd Rt 88 Monongahela PA
## 6 605880 15063200 Monongahela Valley Hosp Rd Rt 88 Monongahela PA
## ZIP REGION TERR DATE UNITS CALLS
## 1 15063 East Region 20108 3/31/2012 16 0
## 2 15063 East Region 20108 6/30/2012 0 4
## 3 15063 East Region 20108 8/31/2012 0 2
## 4 15063 East Region 20108 9/30/2012 0 2
## 5 15063 East Region 20108 11/30/2012 0 2
## 6 15063 East Region 20108 2/28/2013 0 2
mydata$TG_ID <- factor(mydata$TG_ID) #TG_ID should by factor as an identifier
mydata$ZIPCODE <- as.factor(clean.zipcodes(mydata$ZIP)) #clean zipcodes by adding leading 0 and 5 digit long
mydata$ZIP <- NULL #delete old Zip column
head(mydata)
## TG_ID ACCOUNT_NUM NAME ADDRESS CITY STATE
## 1 605880 15063200 Monongahela Valley Hosp Rd Rt 88 Monongahela PA
## 2 605880 15063200 Monongahela Valley Hosp Rd Rt 88 Monongahela PA
## 3 605880 15063200 Monongahela Valley Hosp Rd Rt 88 Monongahela PA
## 4 605880 15063200 Monongahela Valley Hosp Rd Rt 88 Monongahela PA
## 5 605880 15063200 Monongahela Valley Hosp Rd Rt 88 Monongahela PA
## 6 605880 15063200 Monongahela Valley Hosp Rd Rt 88 Monongahela PA
## REGION TERR DATE UNITS CALLS ZIPCODE
## 1 East Region 20108 3/31/2012 16 0 15063
## 2 East Region 20108 6/30/2012 0 4 15063
## 3 East Region 20108 8/31/2012 0 2 15063
## 4 East Region 20108 9/30/2012 0 2 15063
## 5 East Region 20108 11/30/2012 0 2 15063
## 6 East Region 20108 2/28/2013 0 2 15063
str(mydata)
## 'data.frame': 22828 obs. of 12 variables:
## $ TG_ID : Factor w/ 2765 levels "605880","605881",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ACCOUNT_NUM: Factor w/ 2847 levels "","001A000000cnTGWIA2",..: 101 101 101 101 101 101 101 101 101 101 ...
## $ NAME : Factor w/ 2741 levels "","A R H Regional Medical Center",..: 1473 1473 1473 1473 1473 1473 1473 1473 1473 1473 ...
## $ ADDRESS : Factor w/ 2798 levels "","1 1st Capitol Dr 300",..: 2782 2782 2782 2782 2782 2782 2782 2782 2782 2782 ...
## $ CITY : Factor w/ 1762 levels "","Abbeville",..: 1031 1031 1031 1031 1031 1031 1031 1031 1031 1031 ...
## $ STATE : Factor w/ 33 levels "","AL","AR","AZ",..: 26 26 26 26 26 26 26 26 26 26 ...
## $ REGION : Factor w/ 3 levels "","East Region",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ TERR : int 20108 20108 20108 20108 20108 20108 20108 20108 20108 20108 ...
## $ DATE : Factor w/ 37 levels "","1/31/2012",..: 17 26 32 35 8 14 9 22 25 28 ...
## $ UNITS : int 16 0 0 0 0 0 2 0 8 8 ...
## $ CALLS : int 0 4 2 2 2 2 2 2 0 2 ...
## $ ZIPCODE : Factor w/ 2567 levels "15009","15025",..: 3 3 3 3 3 3 3 3 3 3 ...
data(zipcode)
mydata <- merge(mydata, zipcode, by.x = 'ZIPCODE', by.y = 'zip', all.x=TRUE)
head(mydata)
## ZIPCODE TG_ID ACCOUNT_NUM NAME ADDRESS
## 1 15009 654194 15009200 Heritage Valley Beaver 1000 Dutch Ridge Rd
## 2 15009 654194 15009200 Heritage Valley Beaver 1000 Dutch Ridge Rd
## 3 15009 654194 15009200 Heritage Valley Beaver 1000 Dutch Ridge Rd
## 4 15009 654194 15009200 Heritage Valley Beaver 1000 Dutch Ridge Rd
## 5 15009 654194 15009200 Heritage Valley Beaver 1000 Dutch Ridge Rd
## 6 15009 654194 15009200 Heritage Valley Beaver 1000 Dutch Ridge Rd
## CITY STATE REGION TERR DATE UNITS CALLS city state
## 1 Beaver PA East Region 20108 6/30/2013 0 1 Beaver PA
## 2 Beaver PA East Region 20108 3/31/2012 0 1 Beaver PA
## 3 Beaver PA East Region 20108 10/31/2014 0 1 Beaver PA
## 4 Beaver PA East Region 20108 2/28/2013 0 2 Beaver PA
## 5 Beaver PA East Region 20108 12/31/2012 0 1 Beaver PA
## 6 Beaver PA East Region 20108 2/29/2012 0 1 Beaver PA
## latitude longitude
## 1 40.69528 -80.3299
## 2 40.69528 -80.3299
## 3 40.69528 -80.3299
## 4 40.69528 -80.3299
## 5 40.69528 -80.3299
## 6 40.69528 -80.3299
myunits <- mydata %>%
filter(UNITS != 0)
#myunits[ which(myunits$UNITS == 0),]
hist(myunits$UNITS, breaks = 50, main="Order Size Frequency", xlab="UNITS", col="lightgreen", xlim = c(-20,100))
mycalls <- mydata %>%
filter(CALLS != 0)
#mycalls[ which(mycalls$CALLS == 0),]
hist(mycalls$CALLS, breaks = 50, main="Number of Call Frequency", xlab="CALLS", col="lightblue", xlim = c(0,10))
plot(mydata$CALLS, mydata$UNITS , xlab="CALLS" , ylab="UNITS")
cor(myunits$CALLS, myunits$UNITS)
## [1] 0.1797944
cor(mycalls$CALLS, mycalls$UNITS)
## [1] 0.1330366
qqnorm(mydata$UNITS, main = "Normal Q-Q Plot for Units")
qqline(mydata$UNITS)
qqnorm(mydata$CALLS, main = "Normal Q-Q Plot for Calls")
qqline(mydata$CALLS)
data(zipcode)
zipcode$region = substr(zipcode$zip, 1, 1)
g = ggplot(data = zipcode) + geom_point(aes(x=longitude, y=latitude, colour=region))
# simplify display and limit to the "lower 48"
g = g + theme_bw() + scale_x_continuous(limits = c(-125,-66), breaks = NULL)
g = g + scale_y_continuous(limits = c(25,50), breaks = NULL)
# don't need axis labels
g = g + labs(x=NULL, y=NULL)
g
## Warning: Removed 1333 rows containing missing values (geom_point).
g <- NULL
g = ggplot(data = mydata)
g = g + geom_point(aes(x=CALLS, y=UNITS))
#g = g + facet_grid(. ~ REGION)
#g = g + geom_smooth(method = "lm")
g = g + theme_gray()
g
## Warning: Removed 693 rows containing missing values (geom_point).
map <- NULL
map <- get_map(location = 'united states', zoom = 4, maptype = "terrain",
source = 'google', color = 'color')
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=united+states&zoom=4&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
ggmap(map)+geom_point(aes (x=longitude, y=latitude, show_guide = TRUE,color = UNITS),
data = myunits, alpha = 0.5, na.rm = T )
map <- NULL
map <- get_map(location = 'united states', zoom =4, maptype = "terrain",
source = 'google', color = 'color')
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=united+states&zoom=4&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
ggmap(map)+geom_point(aes (x=longitude, y=latitude, show_guide = TRUE,color = CALLS),
data = mycalls, alpha = 0.5, na.rm = T )
str(mydata)
## 'data.frame': 22828 obs. of 16 variables:
## $ ZIPCODE : Factor w/ 2567 levels "15009","15025",..: 1 1 1 1 1 1 1 2 2 2 ...
## $ TG_ID : Factor w/ 2765 levels "605880","605881",..: 2667 2667 2667 2667 2667 2667 2667 2323 2323 2323 ...
## $ ACCOUNT_NUM: Factor w/ 2847 levels "","001A000000cnTGWIA2",..: 99 99 99 99 99 99 99 100 100 100 ...
## $ NAME : Factor w/ 2741 levels "","A R H Regional Medical Center",..: 940 940 940 940 940 940 940 1056 1056 1056 ...
## $ ADDRESS : Factor w/ 2798 levels "","1 1st Capitol Dr 300",..: 91 91 91 91 91 91 91 2081 2081 2081 ...
## $ CITY : Factor w/ 1762 levels "","Abbeville",..: 107 107 107 107 107 107 107 283 283 283 ...
## $ STATE : Factor w/ 33 levels "","AL","AR","AZ",..: 26 26 26 26 26 26 26 26 26 26 ...
## $ REGION : Factor w/ 3 levels "","East Region",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ TERR : int 20108 20108 20108 20108 20108 20108 20108 20108 20108 20108 ...
## $ DATE : Factor w/ 37 levels "","1/31/2012",..: 27 17 7 14 11 16 23 23 25 33 ...
## $ UNITS : int 0 0 0 0 0 0 0 6 2 0 ...
## $ CALLS : int 1 1 1 2 1 1 1 2 0 2 ...
## $ city : chr "Beaver" "Beaver" "Beaver" "Beaver" ...
## $ state : chr "PA" "PA" "PA" "PA" ...
## $ latitude : num 40.7 40.7 40.7 40.7 40.7 ...
## $ longitude : num -80.3 -80.3 -80.3 -80.3 -80.3 ...
myaccounts <- NULL
myaccounts <- mydata %>%
group_by(ACCOUNT_NUM, NAME, ADDRESS, CITY, STATE, ZIPCODE) %>% #
#select(OUTLET_NUM, NAME, ADDRESS, CITY, STATE, ZIPCODE) %>%
summarise(
TOTAL_UNITS = sum(UNITS),
TOTAL_CALLS = sum(CALLS)
)
#check an account
myaccounts[ which(myaccounts$TOTAL_UNITS >0),]
## Source: local data frame [2,552 x 8]
## Groups: ACCOUNT_NUM, NAME, ADDRESS, CITY, STATE
##
## ACCOUNT_NUM NAME ADDRESS
## 1 10 Madison Memorial Hospital 309 Ne Marion St
## 2 12 Maryvale Hospital Medical Center 5102 W Campbell Ave
## 3 13 Miami Dade County Em Fire Rescure 9300 Nw 41st St
## 4 15025200 Jefferson Regional Medical Center 565 Coal Valley Rd
## 5 15063200 Monongahela Valley Hosp Rd Rt 88
## 6 15065200 Allegheny Valley Hospital 1301 Carlisle St
## 7 15143201 Heritage Valley Sewickley 720 Blackburn Rd
## 8 15146201 Forbes Reg Hlth Ctr-E Sub 2570 Haymaker Rd
## 9 15203250 UPMC Pharmacy Central 3175 E Carson St
## 10 15212200 Allegheny General Hospital 1300 Sandusky St Ste 405
## .. ... ... ...
## Variables not shown: CITY (fctr), STATE (fctr), ZIPCODE (fctr),
## TOTAL_UNITS (int), TOTAL_CALLS (int)
myaccounts[ which(myaccounts$ACCOUNT_NUM == '15143201'),]
## Source: local data frame [1 x 8]
## Groups: ACCOUNT_NUM, NAME, ADDRESS, CITY, STATE
##
## ACCOUNT_NUM NAME ADDRESS CITY STATE
## 1 15143201 Heritage Valley Sewickley 720 Blackburn Rd Sewickley PA
## Variables not shown: ZIPCODE (fctr), TOTAL_UNITS (int), TOTAL_CALLS (int)
mydata[ which(mydata$ACCOUNT_NUM == '15143201'),]
## ZIPCODE TG_ID ACCOUNT_NUM NAME ADDRESS
## 41 15143 652644 15143201 Heritage Valley Sewickley 720 Blackburn Rd
## 42 15143 652644 15143201 Heritage Valley Sewickley 720 Blackburn Rd
## 43 15143 652644 15143201 Heritage Valley Sewickley 720 Blackburn Rd
## CITY STATE REGION TERR DATE UNITS CALLS city
## 41 Sewickley PA East Region 20108 5/31/2012 0 1 Sewickley
## 42 Sewickley PA East Region 20108 4/30/2014 2 0 Sewickley
## 43 Sewickley PA East Region 20108 10/31/2014 0 1 Sewickley
## state latitude longitude
## 41 PA 40.56109 -80.15554
## 42 PA 40.56109 -80.15554
## 43 PA 40.56109 -80.15554
head (myaccounts)
## Source: local data frame [6 x 8]
## Groups: ACCOUNT_NUM, NAME, ADDRESS, CITY, STATE
##
## ACCOUNT_NUM NAME
## 1
## 2 001A000000cnTGWIA2 Moses Cone Hospital Outpatient Pharmacy
## 3 001A000000cnTmtIAE Georgia Poison Center
## 4 001A000000cnToiIAE Ft Leonard Wood Hospital Outpatient
## 5 001A000000cnTS0IAM Sacred Heart Hospital Of Pensacola
## 6 001A000000cnTxqIAE Bon Secours Mary Immaculate
## Variables not shown: ADDRESS (fctr), CITY (fctr), STATE (fctr), ZIPCODE
## (fctr), TOTAL_UNITS (int), TOTAL_CALLS (int)
str(myaccounts)
## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame': 2850 obs. of 8 variables:
## $ ACCOUNT_NUM: Factor w/ 2847 levels "","001A000000cnTGWIA2",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ NAME : Factor w/ 2741 levels "","A R H Regional Medical Center",..: 1 1493 800 777 1972 216 1713 1720 2242 2438 ...
## $ ADDRESS : Factor w/ 2798 levels "","1 1st Capitol Dr 300",..: 1 329 2728 469 1996 2516 1673 2795 725 1729 ...
## $ CITY : Factor w/ 1762 levels "","Abbeville",..: 1 619 71 532 1221 992 1194 318 1180 1139 ...
## $ STATE : Factor w/ 33 levels "","AL","AR","AZ",..: 1 19 10 17 9 31 5 27 5 5 ...
## $ ZIPCODE : Factor w/ 2567 levels "15009","15025",..: NA 312 519 1318 652 192 2382 428 2354 2292 ...
## $ TOTAL_UNITS: int NA 0 0 0 0 0 0 0 0 0 ...
## $ TOTAL_CALLS: int NA 62 10 3 26 9 9 8 5 21 ...
## - attr(*, "vars")=List of 5
## ..$ : symbol ACCOUNT_NUM
## ..$ : symbol NAME
## ..$ : symbol ADDRESS
## ..$ : symbol CITY
## ..$ : symbol STATE
## - attr(*, "drop")= logi TRUE
summary(myaccounts)
## ACCOUNT_NUM NAME
## 2.43E+17 : 4 Memorial Hospital : 10
## : 1 Va Medical Center : 7
## 001A000000cnTGWIA2: 1 Good Samaritan Hospital : 4
## 001A000000cnTmtIAE: 1 MULTIPLE DOCTORS : 4
## 001A000000cnToiIAE: 1 Citizens Medical Center : 3
## 001A000000cnTS0IAM: 1 Johnston Memorial Hospital: 3
## (Other) :2841 (Other) :2819
## ADDRESS CITY STATE
## 100 Hospital Dr : 5 HOUSTON : 25 TX : 380
## 101 Hospital Dr : 4 San Antonio : 17 CA : 314
## 1 Medical Center Blvd: 3 Jacksonville: 16 FL : 199
## 200 Hospital Dr : 3 Columbia : 15 GA : 140
## Hwy : 3 Jackson : 13 NC : 127
## 1 1st Capitol Dr 300 : 2 Los Angeles : 13 TN : 120
## (Other) :2830 (Other) :2751 (Other):1570
## ZIPCODE TOTAL_UNITS TOTAL_CALLS
## 77030 : 7 Min. : -12.0 Min. : 0.000
## 72205 : 5 1st Qu.: 6.0 1st Qu.: 0.000
## 76104 : 5 Median : 18.0 Median : 3.000
## 77845 : 5 Mean : 45.4 Mean : 8.617
## 78229 : 5 3rd Qu.: 46.0 3rd Qu.: 11.000
## (Other):2822 Max. :2568.0 Max. :130.000
## NA's : 1 NA's :1 NA's :1
UnitsCount <- NULL
UnitsCount <- mydata %>%
filter(UNITS > 0) %>%
count(ACCOUNT_NUM)
colnames(UnitsCount)[colnames(UnitsCount)=="n"] <- "NumberOfMonthsOrdered"
myaccountUnits <- merge(myaccounts, UnitsCount, by.x = 'ACCOUNT_NUM', by.y = 'ACCOUNT_NUM' , all.x=TRUE)
#check an account
myaccountUnits[ which(myaccountUnits$ACCOUNT_NUM == '15143201'),]
## ACCOUNT_NUM NAME ADDRESS CITY STATE
## 103 15143201 Heritage Valley Sewickley 720 Blackburn Rd Sewickley PA
## ZIPCODE TOTAL_UNITS TOTAL_CALLS NumberOfMonthsOrdered
## 103 15143 2 2 1
#----------------------------------------------------------------------------------------------------
CallsCount <- NULL
CallsCount <- mydata %>%
filter(CALLS > 0) %>%
count(ACCOUNT_NUM)
colnames(CallsCount)[colnames(CallsCount)=="n"] <- "NumberOfMonthsCalled"
#check an account
CallsCount[ which(CallsCount$ACCOUNT_NUM == '15143201'),]
## Source: local data frame [1 x 2]
##
## ACCOUNT_NUM NumberOfMonthsCalled
## 1 15143201 2
myaccounts <- merge(myaccounts, CallsCount, by.x = 'ACCOUNT_NUM', by.y = 'ACCOUNT_NUM' , all.x=TRUE)
#check an account
myaccounts[ which(myaccounts$ACCOUNT_NUM == '15143201'),]
## ACCOUNT_NUM NAME ADDRESS CITY STATE
## 103 15143201 Heritage Valley Sewickley 720 Blackburn Rd Sewickley PA
## ZIPCODE TOTAL_UNITS TOTAL_CALLS NumberOfMonthsCalled
## 103 15143 2 2 2
#-------------------
myaccounts <- merge(myaccounts, myaccountUnits, by.x = 'ACCOUNT_NUM', by.y = 'ACCOUNT_NUM' , all.x=TRUE)
myaccounts[ which(myaccounts$ACCOUNT_NUM == '15143201'),]
## ACCOUNT_NUM NAME.x ADDRESS.x CITY.x
## 103 15143201 Heritage Valley Sewickley 720 Blackburn Rd Sewickley
## STATE.x ZIPCODE.x TOTAL_UNITS.x TOTAL_CALLS.x NumberOfMonthsCalled
## 103 PA 15143 2 2 2
## NAME.y ADDRESS.y CITY.y STATE.y ZIPCODE.y
## 103 Heritage Valley Sewickley 720 Blackburn Rd Sewickley PA 15143
## TOTAL_UNITS.y TOTAL_CALLS.y NumberOfMonthsOrdered
## 103 2 2 1
myaccounts$NAME.y <- NULL
myaccounts$ADDRESS.y <- NULL
myaccounts$CITY.y <- NULL
myaccounts$STATE.y <- NULL
myaccounts$ZIPCODE.y <- NULL
myaccounts$TOTAL_UNITS.y <- NULL
myaccounts$TOTAL_CALLS.y <- NULL
myaccounts[ which(myaccounts$ACCOUNT_NUM == '15143201'),]
## ACCOUNT_NUM NAME.x ADDRESS.x CITY.x
## 103 15143201 Heritage Valley Sewickley 720 Blackburn Rd Sewickley
## STATE.x ZIPCODE.x TOTAL_UNITS.x TOTAL_CALLS.x NumberOfMonthsCalled
## 103 PA 15143 2 2 2
## NumberOfMonthsOrdered
## 103 1
#graph units and calls
plot(myaccounts$TOTAL_UNITS.x~myaccounts$TOTAL_CALLS.x, xlab = "CALLS", ylab = "UNITS")
myaccounts[ which(myaccounts$TOTAL_UNITS.x > 2000),]
## ACCOUNT_NUM NAME.x
## 2594 92354200 Loma Linda University Medical Center And Childrens
## ADDRESS.x CITY.x STATE.x ZIPCODE.x TOTAL_UNITS.x
## 2594 11234 Anderson St Ste A Loma Linda CA 92354 2568
## TOTAL_CALLS.x NumberOfMonthsCalled NumberOfMonthsOrdered
## 2594 77 27 20
myaccounts[ which(myaccounts$NumberOfMonthsOrdered > 25),]
## ACCOUNT_NUM NAME.x ADDRESS.x
## 755 32207200 Baptist Medical Center 800 Prudential Dr Fl 4
## 2091 78229206 University Hospital 4502 Medical Dr
## CITY.x STATE.x ZIPCODE.x TOTAL_UNITS.x TOTAL_CALLS.x
## 755 Jacksonville FL 32207 584 23
## 2091 San Antonio TX 78229 1474 48
## NumberOfMonthsCalled NumberOfMonthsOrdered
## 755 9 27
## 2091 15 29
plot(myaccounts$NumberOfMonthsOrdered~myaccounts$NumberOfMonthsCalled, xlab = "NumberOfMonthsCalled in last 3 years", ylab = "NumberOfMonthsOrdered in last 3 years")
cor(myaccounts$NumberOfMonthsOrdered, myaccounts$NumberOfMonthsCalled)
## [1] NA
levels = myaccounts$TOTAL_UNITS.x
quantile(levels, c(0, .25, .5, .75, 1), na.rm = TRUE)
## 0% 25% 50% 75% 100%
## -12 6 18 46 2568
# 0% 25% 50% 75% 100%
# -12 6 16 42 2568
quantile(myaccounts$TOTAL_UNITS.x , probs = seq(0, 1, 0.25), na.rm = TRUE, names = TRUE)
## 0% 25% 50% 75% 100%
## -12 6 18 46 2568
myQuantiles <- NULL
myQuantiles <- within(myaccounts, quantile <- cut(myaccounts$TOTAL_UNITS.x, quantile(myaccounts$TOTAL_UNITS.x, probs=0.25, 0.5, 0.75), include.lowest=TRUE, labels=FALSE))
myQuantiles[ which(myQuantiles$NumberOfMonthsOrdered > 25),]
## ACCOUNT_NUM NAME.x ADDRESS.x
## 755 32207200 Baptist Medical Center 800 Prudential Dr Fl 4
## 2091 78229206 University Hospital 4502 Medical Dr
## CITY.x STATE.x ZIPCODE.x TOTAL_UNITS.x TOTAL_CALLS.x
## 755 Jacksonville FL 32207 584 23
## 2091 San Antonio TX 78229 1474 48
## NumberOfMonthsCalled NumberOfMonthsOrdered quantile
## 755 9 27 2
## 2091 15 29 4
plot(myQuantiles$quantile~myQuantiles$NumberOfMonthsOrdered, xlab = "NumberOfMonthsOrdered", ylab="Quantiles")
plot(myQuantiles$quantile~myQuantiles$NumberOfMonthsCalled, xlab = "NumberOfMonthsCalled", ylab = "Quantiles")
qqnorm(myQuantiles$NumberOfMonthsOrdered, main = "Normal Q-Q Plot for Units")
qqline(myQuantiles$NumberOfMonthsOrdered)
qqnorm(myQuantiles$NumberOfMonthsCalled, main = "Normal Q-Q Plot for Calls")
qqline(myQuantiles$NumberOfMonthsCalled)
lmResult <- with(myaccounts, lm(NumberOfMonthsOrdered ~ NumberOfMonthsCalled))
summary(lmResult)
##
## Call:
## lm(formula = NumberOfMonthsOrdered ~ NumberOfMonthsCalled)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.8604 -2.2601 -0.6365 1.3869 21.8456
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.09524 0.12827 24.13 <2e-16 ***
## NumberOfMonthsCalled 0.27061 0.01294 20.91 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.49 on 1697 degrees of freedom
## (1163 observations deleted due to missingness)
## Multiple R-squared: 0.2048, Adjusted R-squared: 0.2044
## F-statistic: 437.2 on 1 and 1697 DF, p-value: < 2.2e-16
plot(myaccounts$NumberOfMonthsOrdered~myaccounts$NumberOfMonthsCalled, xlab = "NumberOfMonthsCalled in last 3 years", ylab = "NumberOfMonthsOrdered in last 3 years")
abline(lmResult, col="red")
plot(lmResult)