library(broom) # used to neatly pass model results (model chaining/iteration)
## Warning: package 'broom' was built under R version 3.2.3
library(forecast) # consider using plot.ly for interactive forecasts
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Loading required package: timeDate
## This is forecast 6.2
library(plyr)
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
library(nortest)
library(glmnet) # http://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-2
library(caret)
## Warning: package 'caret' was built under R version 3.2.3
## Loading required package: lattice
## Loading required package: ggplot2
library(Amelia) # missing value viz
## Warning: package 'Amelia' was built under R version 3.2.3
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2016 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
#
#
# 1.DATASET/QUESTIONS -> 2.FEATURES -> 3.ALGORITHM -> 4.EVALUATION (rinse/repeat)
#
#
sales <- read.csv("https://www.dropbox.com/s/yxgij2mtsou0lf0/mj-sales-transformed.csv?dl=1")
violations <- read.csv("https://www.dropbox.com/s/euwbe6mciu9oaw9/mj-violations-transformed.csv?dl=1")
applicants <- read.csv("https://www.dropbox.com/s/hw3ci1l8w55wwxw/all_applicants.csv?dl=1")
# for merging, copy License. into a new License_Number column so the join key names match
applicants$License_Number <- applicants$License.
# what's missing?
missmap(applicants, main = "Applicants: Missing values vs observed")

missmap(sales, main = "Sales: Missing values vs observed")

missmap(violations, main = "Violations: Missing values vs observed")
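# a numeric complement to the maps (sketch, not run here): share of NA values per column
# sort(sapply(applicants, function(x) mean(is.na(x))), decreasing = TRUE)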

# format currency fields
# TODO: some cells use parentheses (negative amounts) and a bare '-'
sales$Total_Sales <- as.numeric(gsub("\\$","", gsub("\\,","", sales$Total_Sales)))
## Warning: NAs introduced by coercion
# TODO: same parentheses/'-' issue as above; see the parse_currency sketch below
sales$Excise_Tax_Due <- as.numeric(gsub("\\$","", gsub("\\,","", sales$Excise_Tax_Due)))
## Warning: NAs introduced by coercion
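# a minimal sketch for the TODO above (assumption: "(1,234.56)" means a negative amount and a
# bare "-" means missing; parse_currency is a hypothetical helper, not applied to the data here)
# parse_currency <- function(x) {
#   x <- gsub("[$,]", "", trimws(as.character(x)))
#   x[x == "-"] <- NA
#   neg <- grepl("^\\(.*\\)$", x)
#   x <- gsub("[()]", "", x)
#   out <- suppressWarnings(as.numeric(x))
#   out[neg] <- -out[neg]
#   out
# }
# sales$Total_Sales <- parse_currency(sales$Total_Sales)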
# calculate an effective tax rate (not always 25%); rows with missing or zero Total_Sales give NA/NaN
sales$tax_rate <- sales$Excise_Tax_Due / sales$Total_Sales
# use applicants and sum sales and violations
# to get: license_number | total_sales | total_violations
# TODO: Do I need to scale features?
# NOTE: FUN=count does not work with aggregate() here; plyr::count() is used below instead
# aggviolations <- aggregate(violations, by=list(violations$License_Number), FUN=count)
# then merge on license_number
# applicants_with_sales <- merge(applicants, aggsales, by="License_Number")
# use applicants and aggregate sales
# keep only the columns needed for aggregation: License_Number, Total_Sales, Excise_Tax_Due, tax_rate
sales_only <- sales[,c(2,4,5,8)]
aggsales <- aggregate(sales_only, by=list(sales_only$License_Number), FUN=sum)
# BUG: aggsales$License_Number is not correct here; aggregate() with by= applies FUN to every
# column of sales_only (including License_Number) and puts the grouping values in Group.1
# aggsales[aggsales$Group.1==413287,]
# workaround: replace the summed License_Number values with the grouping values from Group.1
aggsales$License_Number <- aggsales$Group.1
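# an alternative sketch that avoids the Group.1 workaround: the formula interface keeps the
# grouping column named License_Number (note: it also drops NA rows by default, so totals may differ)
# aggsales2 <- aggregate(cbind(Total_Sales, Excise_Tax_Due, tax_rate) ~ License_Number,
#                        data = sales_only, FUN = sum)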
# aggregate violations per license with plyr::count()
aggviolations <- count(violations, c('License_Number'))
# NOTE: some businesses use the same license to operate different business types (processor, retailer) with different tax treatment
# rename count frequency to something meaningful
names(aggviolations)[names(aggviolations)=='freq'] <- "violation_count"
# do a left join to merge aggregate sales and violations to license number
applicants <- merge(applicants, aggsales, by='License_Number', all.x=TRUE) #left join
applicants <- merge(applicants, aggviolations, by='License_Number', all.x=TRUE) #left join
# check the merge!
str(applicants) # Total_Sales and violation_count should be added
## 'data.frame': 5903 obs. of 35 variables:
## $ License_Number : int 51124 59974 59974 59978 59978 70201 70283 72437 74433 76458 ...
## $ X : int 89 2035 1494 1019 725 432 435 1084 1112 577 ...
## $ City : Factor w/ 314 levels " ",..: 272 254 254 201 201 272 249 249 75 272 ...
## $ City.1 : Factor w/ 331 levels "","ABERDEEN ",..: 1 260 260 264 264 1 300 254 261 1 ...
## $ County : Factor w/ 38 levels "","ADAMS","ASOTIN",..: 26 22 22 23 23 26 16 16 9 26 ...
## $ DateCreated : int NA 20150903 20150903 20150904 20150904 NA 20140101 20150806 20131221 NA ...
## $ DayPhone : num 2.53e+09 3.60e+09 3.60e+09 3.61e+09 3.61e+09 ...
## $ Email : Factor w/ 3264 levels "","100terabytes@gmail.com",..: 3218 685 685 3113 3113 2793 494 2395 2216 2782 ...
## $ License. : int 51124 59974 59974 59978 59978 70201 70283 72437 74433 76458 ...
## $ Mail.Suite.Rm : Factor w/ 88 levels ""," ",..: 1 2 2 2 2 1 2 2 2 1 ...
## $ MailAddress : Factor w/ 3302 levels "0714 NE 72ND AVE APT N59 ",..: 2477 1193 1193 116 116 2355 2765 2757 2293 2900 ...
## $ MailCity : Factor w/ 149 levels "","ABERDEEN ",..: 134 1 1 1 1 134 1 1 1 65 ...
## $ MailState : Factor w/ 3 levels "","CA","WA": 3 1 1 1 1 3 1 1 1 3 ...
## $ MailSuite.Rm : Factor w/ 18 levels ""," ",..: 2 1 1 1 1 2 1 1 1 2 ...
## $ MailZipCode : num 9.84e+08 NA NA NA NA ...
## $ NightPhone : num 2.53e+09 3.60e+09 3.60e+09 3.61e+09 3.61e+09 ...
## $ OwnerName : Factor w/ 3164 levels "","\"OH\" MCDONALD FARMS, LLC",..: 411 1399 1399 2970 2970 1781 2181 2144 2500 2601 ...
## $ PrivDesc : Factor w/ 6 levels "MARIJUANA PROCESSOR ",..: 6 2 1 3 1 6 5 5 5 6 ...
## $ PrivilegeStatus: Factor w/ 5 levels "0CTIVE (ISSUED)",..: 5 5 5 5 5 5 5 4 5 5 ...
## $ ReasonAction : Factor w/ 4 levels "APPROVED ",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ State : Factor w/ 2 levels " ","WA": 2 2 2 2 2 2 2 2 2 2 ...
## $ State.1 : Factor w/ 9 levels "","CA","FO","GA",..: 1 9 9 9 9 1 7 9 9 1 ...
## $ StatusDate : int 20151103 NA NA NA NA 20151021 NA NA NA 20151016 ...
## $ StreetAddress : Factor w/ 3953 levels " ",..: 3467 1535 1535 1633 1633 3264 2722 3866 2680 1651 ...
## $ Suite.Rm : Factor w/ 193 levels " ",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Tradename : Factor w/ 3379 levels "'OH' MCDONALD FARMS ",..: 424 1538 1538 3183 3183 1900 2316 393 448 2710 ...
## $ UBI : num 6.03e+15 6.04e+15 6.04e+15 6.03e+15 6.03e+15 ...
## $ ZipCode : num 9.84e+08 9.86e+08 9.86e+08 9.88e+08 9.88e+08 ...
## $ ZipCode.1 : num NA 9.86e+08 9.86e+08 9.86e+08 9.86e+08 ...
## $ type : Factor w/ 4 levels "medical","processor",..: 1 3 2 3 2 1 4 4 4 1 ...
## $ Group.1 : int NA NA NA NA NA NA NA 72437 NA NA ...
## $ Total_Sales : num NA NA NA NA NA ...
## $ Excise_Tax_Due : num NA NA NA NA NA ...
## $ tax_rate : num NA NA NA NA NA ...
## $ violation_count: int NA NA NA NA NA NA NA NA NA NA ...
# missing-value assumptions:
# sales: the business has not made money or is not reporting, so NAs in Total_Sales, Excise_Tax_Due and tax_rate are set to 0
# violations: innocence assumed, so NAs in violation_count are set to 0
applicants$Total_Sales[is.na(applicants$Total_Sales)] <- 0
applicants$Excise_Tax_Due[is.na(applicants$Excise_Tax_Due)] <- 0
applicants$tax_rate[is.na(applicants$tax_rate)] <- 0
applicants$violation_count[is.na(applicants$violation_count)] <- 0
# look at the distribution to test for normality
# plot(density(applicants$Total_Sales)) # Error in plot.new() : figure margins too large
ad.test(applicants$Total_Sales)
##
## Anderson-Darling normality test
##
## data: applicants$Total_Sales
## A = 1763.3, p-value < 2.2e-16
cvm.test(applicants$Total_Sales)
## Warning in cvm.test(applicants$Total_Sales): p-value is smaller than
## 7.37e-10, cannot be computed more accurately
##
## Cramer-von Mises normality test
##
## data: applicants$Total_Sales
## W = 377.18, p-value = 7.37e-10
lillie.test(applicants$Total_Sales)
##
## Lilliefors (Kolmogorov-Smirnov) normality test
##
## data: applicants$Total_Sales
## D = 0.42591, p-value < 2.2e-16
pearson.test(applicants$Total_Sales)
##
## Pearson chi-square normality test
##
## data: applicants$Total_Sales
## P = 253360, p-value < 2.2e-16
# sf.test(applicants$Total_Sales) # sample size must be between 5 and 5000
# low p-values across all tests, so Total_Sales is NOT normally distributed
# sanity check: the same tests were tried on known samples, e.g. ad.test(rnorm(100, mean = 5, sd = 3)) and runif(100, min = 2, max = 4)
# not pursued: stratifying into normally distributed subgroups; instead, assume a non-normal distribution and
# use equivalent tools for non-normal data: http://www.isixsigma.com/tools-templates/normality/dealing-non-normal-data-strategies-and-tools/
kruskal.test(list(applicants$violation_count, applicants$Total_Sales))
##
## Kruskal-Wallis rank sum test
##
## data: list(applicants$violation_count, applicants$Total_Sales)
## Kruskal-Wallis chi-squared = 627.48, df = 1, p-value < 2.2e-16
kruskal.test(list(applicants$Total_Sales, applicants$violation_count)) # same result: the test is symmetric in the order of the groups
##
## Kruskal-Wallis rank sum test
##
## data: list(applicants$Total_Sales, applicants$violation_count)
## Kruskal-Wallis chi-squared = 627.48, df = 1, p-value < 2.2e-16
# look at the violation_count distribution to test for normality
plot(density(applicants$violation_count))

# shapiro.test(applicants$violation_count) # shapiro.test() requires a sample size between 3 and 5000
qqnorm(applicants$violation_count)
qqline(applicants$violation_count, col = 2)

ad.test(applicants$violation_count)
##
## Anderson-Darling normality test
##
## data: applicants$violation_count
## A = 1904.1, p-value < 2.2e-16
cvm.test(applicants$violation_count)
## Warning in cvm.test(applicants$violation_count): p-value is smaller than
## 7.37e-10, cannot be computed more accurately
##
## Cramer-von Mises normality test
##
## data: applicants$violation_count
## W = 413.15, p-value = 7.37e-10
lillie.test(applicants$violation_count)
##
## Lilliefors (Kolmogorov-Smirnov) normality test
##
## data: applicants$violation_count
## D = 0.51373, p-value < 2.2e-16
pearson.test(applicants$violation_count)
##
## Pearson chi-square normality test
##
## data: applicants$violation_count
## P = 328860, p-value < 2.2e-16
# sf.test(applicants$violation_count) # sf.test() requires a sample size between 5 and 5000
# look for anomalies in the data
summary(applicants$ReasonAction) # most applications PENDING, not APPROVED
## APPROVED DISCONTINUED PENDING WITHDRAWN
## 1215 1 4463 224
summary(applicants$State.1) # 13 businesses are located outside WA state
## CA FO GA ID KS OK OR WA
## 732 5 1 2 1 1 1 2 5158
summary(applicants$type) # producer and retailer are the most prevalent types
## medical processor producer retailer
## 732 1500 2043 1628
# TODO: add dummy variables for Suspended/Cancelled/Destruction, since not all violations are severe
# - join applicants with violations and set a dummy variable when violations$Penalty_Type %in%
#   c('Suspension', 'Cancellation of License', 'Destruction of harvestable plants'); see the sketch below
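# a minimal sketch of that severity dummy (assumes violations$Penalty_Type holds the penalty
# description, per the TODO above; not applied to the data here)
# severe <- c('Suspension', 'Cancellation of License', 'Destruction of harvestable plants')
# violations$severe <- as.integer(violations$Penalty_Type %in% severe)
# aggsevere <- aggregate(severe ~ License_Number, data = violations, FUN = max)
# applicants <- merge(applicants, aggsevere, by = 'License_Number', all.x = TRUE)
# applicants$severe[is.na(applicants$severe)] <- 0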
# add dummy variables for each business Type: Producer, Processor, Retailer, Medical
# type.dummies <- dummy('type', applicants, sep=":")
applicants <- dummy.data.frame(names='type', applicants, sep=":")
# TODO: add indicator variables for high-risk counties or cities (one possible sketch below)
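# one possible sketch for the county TODO (hypothetical definition: a county is "high risk"
# when its mean violation_count exceeds the statewide mean; not applied here)
# county_risk <- aggregate(violation_count ~ County, data = applicants, FUN = mean)
# high_risk <- county_risk$County[county_risk$violation_count > mean(applicants$violation_count)]
# applicants$high_risk_county <- as.integer(applicants$County %in% high_risk)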
applicants$violator <- applicants$violation_count > 0
# log transforms; note the zero-imputed rows become -Inf under log(), so log1p() would be a
# safer choice if these features are used downstream
applicants$log_Total_Sales <- log(applicants$Total_Sales)
applicants$log_Excise_Tax_Due <- log(applicants$Excise_Tax_Due)
# is there a correlation between sales and violations? overall? in certain segments?
# overall
summary(lm(applicants$violation_count ~ applicants$Total_Sales)) # low R-squared (~0.11)
##
## Call:
## lm(formula = applicants$violation_count ~ applicants$Total_Sales)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.0302 -0.1114 -0.1114 -0.1114 11.8886
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.114e-01 9.787e-03 11.38 <2e-16 ***
## applicants$Total_Sales 3.407e-07 1.265e-08 26.93 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7392 on 5901 degrees of freedom
## Multiple R-squared: 0.1095, Adjusted R-squared: 0.1093
## F-statistic: 725.3 on 1 and 5901 DF, p-value: < 2.2e-16
summary(glm(applicants$violation_count ~ applicants$Total_Sales)) # same fit as lm() above: glm() defaults to the gaussian family
##
## Call:
## glm(formula = applicants$violation_count ~ applicants$Total_Sales)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.0302 -0.1114 -0.1114 -0.1114 11.8886
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.114e-01 9.787e-03 11.38 <2e-16 ***
## applicants$Total_Sales 3.407e-07 1.265e-08 26.93 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.5463647)
##
## Null deviance: 3620.4 on 5902 degrees of freedom
## Residual deviance: 3224.1 on 5901 degrees of freedom
## AIC: 13188
##
## Number of Fisher Scoring iterations: 2
# logistic (logit) regression for predicting a violator
# simple index-based train/test split (roughly 70/30); note the rows are not shuffled first
train <- applicants[1:4130,]
test <- applicants[4131:5903,]
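# a randomized split via caret would avoid any ordering effects (sketch only; the index split
# above is kept for the models below, and the seed value is arbitrary)
# set.seed(42)
# idx <- createDataPartition(factor(applicants$violator), p = 0.7, list = FALSE)
# train <- applicants[idx, ]; test <- applicants[-idx, ]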
model <- glm(violator ~ Total_Sales + Excise_Tax_Due + PrivDesc,family=binomial(link='logit'),data=train)
summary(model) # significant: sales, tax, retailer type
##
## Call:
## glm(formula = violator ~ Total_Sales + Excise_Tax_Due + PrivDesc,
## family = binomial(link = "logit"), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.2359 -0.3677 -0.3198 -0.2606 2.5846
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -2.660e+00 1.246e-01 -21.348
## Total_Sales 2.541e-06 2.236e-07 11.364
## Excise_Tax_Due -5.345e-06 7.916e-07 -6.752
## PrivDescMARIJUANA PRODUCER TIER 1 3.831e-02 2.808e-01 0.136
## PrivDescMARIJUANA PRODUCER TIER 2 -1.305e-02 1.999e-01 -0.065
## PrivDescMARIJUANA PRODUCER TIER 3 -2.873e-01 2.104e-01 -1.365
## PrivDescMARIJUANA RETAILER -7.050e-01 1.856e-01 -3.799
## PrivDescMEDICAL MARIJUANA 4.712e-01 2.507e-01 1.880
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## Total_Sales < 2e-16 ***
## Excise_Tax_Due 1.46e-11 ***
## PrivDescMARIJUANA PRODUCER TIER 1 0.891458
## PrivDescMARIJUANA PRODUCER TIER 2 0.947923
## PrivDescMARIJUANA PRODUCER TIER 3 0.172271
## PrivDescMARIJUANA RETAILER 0.000145 ***
## PrivDescMEDICAL MARIJUANA 0.060128 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2387.6 on 4129 degrees of freedom
## Residual deviance: 1901.4 on 4122 degrees of freedom
## AIC: 1917.4
##
## Number of Fisher Scoring iterations: 6
model <- glm(violator ~ Total_Sales + Excise_Tax_Due + ReasonAction,family=binomial(link='logit'),data=train)
summary(model) # significant: pending, withdrawn
##
## Call:
## glm(formula = violator ~ Total_Sales + Excise_Tax_Due + ReasonAction,
## family = binomial(link = "logit"), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.1293 -0.2042 -0.1975 -0.1975 2.8095
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.635e+00 8.982e-02 -18.206 < 2e-16 ***
## Total_Sales 2.145e-06 1.913e-07 11.212 < 2e-16 ***
## Excise_Tax_Due -5.128e-06 6.395e-07 -8.019 1.07e-15 ***
## ReasonActionDISCONTINUED -1.094e+01 3.247e+02 -0.034 0.973
## ReasonActionPENDING -2.292e+00 1.517e-01 -15.109 < 2e-16 ***
## ReasonActionWITHDRAWN -1.931e+00 3.868e-01 -4.994 5.92e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2387.6 on 4129 degrees of freedom
## Residual deviance: 1661.5 on 4124 degrees of freedom
## AIC: 1673.5
##
## Number of Fisher Scoring iterations: 11
model <- glm(violator ~ Total_Sales + Excise_Tax_Due + PrivilegeStatus,family=binomial(link='logit'),data=train)
summary(model) # significant: ACTIVE (ISSUED) status
##
## Call:
## glm(formula = violator ~ Total_Sales + Excise_Tax_Due + PrivilegeStatus,
## family = binomial(link = "logit"), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.07826 -0.00005 -0.00005 -0.00005 1.83269
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.248e-01 2.212e-01 -3.728 0.000193
## Total_Sales 1.224e-06 1.741e-07 7.030 2.07e-12
## Excise_Tax_Due -2.892e-06 5.923e-07 -4.882 1.05e-06
## PrivilegeStatusACTIVE (ISSUED) -6.482e-01 2.287e-01 -2.834 0.004593
## PrivilegeStatusCLOSED (PERMANENT) -1.974e+01 1.311e+03 -0.015 0.987983
## PrivilegeStatusPENDING (ISSUED) -4.212e-01 2.740e-01 -1.537 0.124187
## PrivilegeStatusPENDING (NOT ISSUED) -1.974e+01 3.420e+02 -0.058 0.953965
##
## (Intercept) ***
## Total_Sales ***
## Excise_Tax_Due ***
## PrivilegeStatusACTIVE (ISSUED) **
## PrivilegeStatusCLOSED (PERMANENT)
## PrivilegeStatusPENDING (ISSUED)
## PrivilegeStatusPENDING (NOT ISSUED)
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2387.6 on 4129 degrees of freedom
## Residual deviance: 1350.6 on 4123 degrees of freedom
## AIC: 1364.6
##
## Number of Fisher Scoring iterations: 19
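# a minimal evaluation sketch for the last fitted model on the held-out rows (assumption: a 0.5
# probability cutoff; test was split above but has not been used yet)
# test_prob <- predict(model, newdata = test, type = 'response')
# test_pred <- test_prob > 0.5
# table(predicted = test_pred, actual = test$violator)
# mean(test_pred == test$violator) # simple accuracy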
# **** export data to work in Python/sklearn *****
write.csv(applicants, "applicants_transformed.csv")
write.csv(aggsales, "sales_by_license.csv")
write.csv(aggviolations, "violations_by_license.csv")
# use inference() and a t-test to infer risk/sales growth from a larger population of similar businesses?
# load(url("http://assets.datacamp.com/course/dasi/inference.Rdata"))
# inference()
# tt <- t.test()
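# within this dataset, a related non-parametric check (sketch): do violators and non-violators
# differ in Total_Sales? Wilcoxon is used rather than a t-test since normality was rejected above
# wilcox.test(Total_Sales ~ violator, data = applicants)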