capstone

library(broom) # used to neatly pass model results (model chaining/iteration)

## Warning: package 'broom' was built under R version 3.2.3

library(forecast) #consider using plot.ly for interactive forecast

## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: timeDate
## This is forecast 6.2

library(plyr)
library(dummies)

## dummies-1.5.6 provided by Decision Patterns

sales <- read.csv("https://www.dropbox.com/s/yxgij2mtsou0lf0/mj-sales-transformed.csv?dl=1")
violations <- read.csv("https://www.dropbox.com/s/euwbe6mciu9oaw9/mj-violations-transformed.csv?dl=1")
applicants <- read.csv("https://www.dropbox.com/s/hw3ci1l8w55wwxw/all_applicants.csv?dl=1")

# for merging, match join column name
applicants$License_Number <- applicants$License.

# format currency fields
sales$Total_Sales <- as.numeric(gsub("\\$","", gsub("\\,","", sales$Total_Sales)))

## Warning: NAs introduced by coercion

sales$Excise_Tax_Due <- as.numeric(gsub("\\$","", gsub("\\,","", sales$Excise_Tax_Due)))

## Warning: NAs introduced by coercion

# calculate a tax rate (not always 25%)
sales$tax_rate <- sales$Excise_Tax_Due / sales$Total_Sales

# use applicants and sum sales and violations
# to get: license_number | total_sales | total_violations
# TODO: Do I need to scale features?

# get some aggregates first
salesonly <- sales[,c(2,4)]
aggsales <- aggregate(salesonly, by=list(sales$License_Number), FUN=sum)

# TODO no count??
# aggviolations <- aggregate(violations, by=list(violations$License_Number), FUN=count)

# then merge on license_number
# applicants_with_sales <- merge(applicants, aggsales, by="License_Number")

# use applicants and aggregate sales and violations
sales_only <- sales[,c(2,4)]
aggsales <- aggregate(sales_only, by=list(sales_only$License_Number), FUN="sum")
aggviolations <- count(violations, c('License_Number'))

# rename count frequency to something meaningful
names(aggviolations)[names(aggviolations)=='freq'] <- "violation_count"

# do a left join to merge aggregate sales and violations to license number
applicants <- merge(applicants, aggsales, by='License_Number', all.x=TRUE) #left join
applicants <- merge(applicants, aggviolations, by='License_Number', all.x=TRUE) #left join

# check the merge!
str(applicants) # Total_Sales and violation_count should be added

## 'data.frame':    5903 obs. of  33 variables:
##  $ License_Number : int  51124 59974 59974 59978 59978 70201 70283 72437 74433 76458 ...
##  $ X              : int  89 2035 1494 1019 725 432 435 1084 1112 577 ...
##  $ City           : Factor w/ 314 levels "                        ",..: 272 254 254 201 201 272 249 249 75 272 ...
##  $ City.1         : Factor w/ 331 levels "","ABERDEEN                ",..: 1 260 260 264 264 1 300 254 261 1 ...
##  $ County         : Factor w/ 38 levels "","ADAMS","ASOTIN",..: 26 22 22 23 23 26 16 16 9 26 ...
##  $ DateCreated    : int  NA 20150903 20150903 20150904 20150904 NA 20140101 20150806 20131221 NA ...
##  $ DayPhone       : num  2.53e+09 3.60e+09 3.60e+09 3.61e+09 3.61e+09 ...
##  $ Email          : Factor w/ 3264 levels "","100terabytes@gmail.com",..: 3218 685 685 3113 3113 2793 494 2395 2216 2782 ...
##  $ License.       : int  51124 59974 59974 59978 59978 70201 70283 72437 74433 76458 ...
##  $ Mail.Suite.Rm  : Factor w/ 88 levels "","                         ",..: 1 2 2 2 2 1 2 2 2 1 ...
##  $ MailAddress    : Factor w/ 3302 levels "0714 NE 72ND AVE APT N59      ",..: 2477 1193 1193 116 116 2355 2765 2757 2293 2900 ...
##  $ MailCity       : Factor w/ 149 levels "","ABERDEEN                ",..: 134 1 1 1 1 134 1 1 1 65 ...
##  $ MailState      : Factor w/ 3 levels "","CA","WA": 3 1 1 1 1 3 1 1 1 3 ...
##  $ MailSuite.Rm   : Factor w/ 18 levels "","                         ",..: 2 1 1 1 1 2 1 1 1 2 ...
##  $ MailZipCode    : num  9.84e+08 NA NA NA NA ...
##  $ NightPhone     : num  2.53e+09 3.60e+09 3.60e+09 3.61e+09 3.61e+09 ...
##  $ OwnerName      : Factor w/ 3164 levels "","\"OH\" MCDONALD FARMS, LLC",..: 411 1399 1399 2970 2970 1781 2181 2144 2500 2601 ...
##  $ PrivDesc       : Factor w/ 6 levels "MARIJUANA PROCESSOR                ",..: 6 2 1 3 1 6 5 5 5 6 ...
##  $ PrivilegeStatus: Factor w/ 5 levels "0CTIVE (ISSUED)",..: 5 5 5 5 5 5 5 4 5 5 ...
##  $ ReasonAction   : Factor w/ 4 levels "APPROVED       ",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ State          : Factor w/ 2 levels "  ","WA": 2 2 2 2 2 2 2 2 2 2 ...
##  $ State.1        : Factor w/ 9 levels "","CA","FO","GA",..: 1 9 9 9 9 1 7 9 9 1 ...
##  $ StatusDate     : int  20151103 NA NA NA NA 20151021 NA NA NA 20151016 ...
##  $ StreetAddress  : Factor w/ 3953 levels "                              ",..: 3467 1535 1535 1633 1633 3264 2722 3866 2680 1651 ...
##  $ Suite.Rm       : Factor w/ 193 levels "                         ",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Tradename      : Factor w/ 3379 levels "'OH' MCDONALD FARMS                          ",..: 424 1538 1538 3183 3183 1900 2316 393 448 2710 ...
##  $ UBI            : num  6.03e+15 6.04e+15 6.04e+15 6.03e+15 6.03e+15 ...
##  $ ZipCode        : num  9.84e+08 9.86e+08 9.86e+08 9.88e+08 9.88e+08 ...
##  $ ZipCode.1      : num  NA 9.86e+08 9.86e+08 9.86e+08 9.86e+08 ...
##  $ type           : Factor w/ 4 levels "medical","processor",..: 1 3 2 3 2 1 4 4 4 1 ...
##  $ Group.1        : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Total_Sales    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ violation_count: int  NA NA NA NA NA NA NA NA NA NA ...

# is there a correlation between sales and violations? overall? in certain segments?
summary(lm(applicants$violation_count ~ applicants$Total_Sales))

## Warning in summary.lm(lm(applicants$violation_count ~ applicants
## $Total_Sales)): essentially perfect fit: summary may be unreliable

## 
## Call:
## lm(formula = applicants$violation_count ~ applicants$Total_Sales)
## 
## Residuals:
##       1333       1334       3666 
##  1.570e-16 -1.570e-16  2.465e-32 
## 
## Coefficients:
##                         Estimate Std. Error   t value Pr(>|t|)    
## (Intercept)            1.000e+00  2.220e-16 4.503e+15   <2e-16 ***
## applicants$Total_Sales 5.968e-05  1.623e-20 3.677e+15   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.22e-16 on 1 degrees of freedom
##   (5900 observations deleted due to missingness)
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 1.352e+31 on 1 and 1 DF,  p-value: < 2.2e-16

# TODO add dummy variables for Suspended and Cancelled


# TODO add dummary variables for each business Type: Producer, Processor, Retailer, Medical


# **** export data to work in Python/sklearn *****

# TODO use inference() and t-test to infer risk/sales growth from larger population of similar businesses? 
load(url("http://assets.datacamp.com/course/dasi/inference.Rdata"))
#inference()
#tt=t.test(applicants$)

capstone_MJ.R

Ed

Tue Jan 12 13:42:12 2016