library(broom) # used to neatly pass model results (model chaining/iteration)
## Warning: package 'broom' was built under R version 3.2.3
library(forecast) #consider using plot.ly for interactive forecast
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Loading required package: timeDate
## This is forecast 6.2
library(plyr)
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
library(nortest)
library(glmnet) #http://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-2
library(caret)
## Warning: package 'caret' was built under R version 3.2.3
## Loading required package: lattice
## Loading required package: ggplot2
#
#
# 1.DATASET/QUESTIONS -> 2.FEATURES -> 3.ALGORITHM -> 4.EVALUATION (rinse/repeat)
#
#
sales <- read.csv("https://www.dropbox.com/s/yxgij2mtsou0lf0/mj-sales-transformed.csv?dl=1")
violations <- read.csv("https://www.dropbox.com/s/euwbe6mciu9oaw9/mj-violations-transformed.csv?dl=1")
applicants <- read.csv("https://www.dropbox.com/s/hw3ci1l8w55wwxw/all_applicants.csv?dl=1")
# for merging, match join column name
applicants$License_Number <- applicants$License.
# format currency fields
# TODO some cells have () (negative) and -
sales$Total_Sales <- as.numeric(gsub("\\$","", gsub("\\,","", sales$Total_Sales)))
## Warning: NAs introduced by coercion
# TODO some cells have () (negative) and -
sales$Excise_Tax_Due <- as.numeric(gsub("\\$","", gsub("\\,","", sales$Excise_Tax_Due)))
## Warning: NAs introduced by coercion
# calculate a tax rate (not always 25%)
sales$tax_rate <- sales$Excise_Tax_Due / sales$Total_Sales
# use applicants and sum sales and violations
# to get: license_number | total_sales | total_violations
# TODO: Do I need to scale features?
# TODO no count??
# aggviolations <- aggregate(violations, by=list(violations$License_Number), FUN=count)
# then merge on license_number
# applicants_with_sales <- merge(applicants, aggsales, by="License_Number")
# use applicants and aggregate sales
sales_only <- sales[,c(2,4)]
aggsales <- aggregate(sales_only, by=list(sales_only$License_Number), FUN=sum)
# BUG BUG aggsales$License_Number not correct!
# why does the following line put join column in Group.1?
# aggsales[aggsales$Group.1==413287,]
# HACK *replacing* value in License_Number with Group.1
aggsales$License_Number <- aggsales$Group.1
# use applicants and aggregate sales and violations
aggviolations <- count(violations, c('License_Number'))
# NOTE: some businesses use same license to operate different business types (processor, retailer) with diff't tax treatment
# rename count frequency to something meaningful
names(aggviolations)[names(aggviolations)=='freq'] <- "violation_count"
# do a left join to merge aggregate sales and violations to license number
applicants <- merge(applicants, aggsales, by='License_Number', all.x=TRUE) #left join
applicants <- merge(applicants, aggviolations, by='License_Number', all.x=TRUE) #left join
# check the merge!
str(applicants) # Total_Sales and violation_count should be added
## 'data.frame': 5903 obs. of 33 variables:
## $ License_Number : int 51124 59974 59974 59978 59978 70201 70283 72437 74433 76458 ...
## $ X : int 89 2035 1494 1019 725 432 435 1084 1112 577 ...
## $ City : Factor w/ 314 levels " ",..: 272 254 254 201 201 272 249 249 75 272 ...
## $ City.1 : Factor w/ 331 levels "","ABERDEEN ",..: 1 260 260 264 264 1 300 254 261 1 ...
## $ County : Factor w/ 38 levels "","ADAMS","ASOTIN",..: 26 22 22 23 23 26 16 16 9 26 ...
## $ DateCreated : int NA 20150903 20150903 20150904 20150904 NA 20140101 20150806 20131221 NA ...
## $ DayPhone : num 2.53e+09 3.60e+09 3.60e+09 3.61e+09 3.61e+09 ...
## $ Email : Factor w/ 3264 levels "","100terabytes@gmail.com",..: 3218 685 685 3113 3113 2793 494 2395 2216 2782 ...
## $ License. : int 51124 59974 59974 59978 59978 70201 70283 72437 74433 76458 ...
## $ Mail.Suite.Rm : Factor w/ 88 levels ""," ",..: 1 2 2 2 2 1 2 2 2 1 ...
## $ MailAddress : Factor w/ 3302 levels "0714 NE 72ND AVE APT N59 ",..: 2477 1193 1193 116 116 2355 2765 2757 2293 2900 ...
## $ MailCity : Factor w/ 149 levels "","ABERDEEN ",..: 134 1 1 1 1 134 1 1 1 65 ...
## $ MailState : Factor w/ 3 levels "","CA","WA": 3 1 1 1 1 3 1 1 1 3 ...
## $ MailSuite.Rm : Factor w/ 18 levels ""," ",..: 2 1 1 1 1 2 1 1 1 2 ...
## $ MailZipCode : num 9.84e+08 NA NA NA NA ...
## $ NightPhone : num 2.53e+09 3.60e+09 3.60e+09 3.61e+09 3.61e+09 ...
## $ OwnerName : Factor w/ 3164 levels "","\"OH\" MCDONALD FARMS, LLC",..: 411 1399 1399 2970 2970 1781 2181 2144 2500 2601 ...
## $ PrivDesc : Factor w/ 6 levels "MARIJUANA PROCESSOR ",..: 6 2 1 3 1 6 5 5 5 6 ...
## $ PrivilegeStatus: Factor w/ 5 levels "0CTIVE (ISSUED)",..: 5 5 5 5 5 5 5 4 5 5 ...
## $ ReasonAction : Factor w/ 4 levels "APPROVED ",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ State : Factor w/ 2 levels " ","WA": 2 2 2 2 2 2 2 2 2 2 ...
## $ State.1 : Factor w/ 9 levels "","CA","FO","GA",..: 1 9 9 9 9 1 7 9 9 1 ...
## $ StatusDate : int 20151103 NA NA NA NA 20151021 NA NA NA 20151016 ...
## $ StreetAddress : Factor w/ 3953 levels " ",..: 3467 1535 1535 1633 1633 3264 2722 3866 2680 1651 ...
## $ Suite.Rm : Factor w/ 193 levels " ",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Tradename : Factor w/ 3379 levels "'OH' MCDONALD FARMS ",..: 424 1538 1538 3183 3183 1900 2316 393 448 2710 ...
## $ UBI : num 6.03e+15 6.04e+15 6.04e+15 6.03e+15 6.03e+15 ...
## $ ZipCode : num 9.84e+08 9.86e+08 9.86e+08 9.88e+08 9.88e+08 ...
## $ ZipCode.1 : num NA 9.86e+08 9.86e+08 9.86e+08 9.86e+08 ...
## $ type : Factor w/ 4 levels "medical","processor",..: 1 3 2 3 2 1 4 4 4 1 ...
## $ Group.1 : int NA NA NA NA NA NA NA 72437 NA NA ...
## $ Total_Sales : num NA NA NA NA NA ...
## $ violation_count: int NA NA NA NA NA NA NA NA NA NA ...
# missing values assumptions:
# sales - business has not made money or is not reporting, keeping as NA (not making 0)
# violations - innocence assumed, so setting to 0
applicants$violation_count[is.na(applicants$violation_count)] <- 0
# look at the distribution to test for normality
# plot(density(applicants$Total_Sales)) #handle NAs
ad.test(applicants$Total_Sales)
##
## Anderson-Darling normality test
##
## data: applicants$Total_Sales
## A = 212.31, p-value < 2.2e-16
cvm.test(applicants$Total_Sales)
## Warning in cvm.test(applicants$Total_Sales): p-value is smaller than
## 7.37e-10, cannot be computed more accurately
##
## Cramer-von Mises normality test
##
## data: applicants$Total_Sales
## W = 42.699, p-value = 7.37e-10
lillie.test(applicants$Total_Sales)
##
## Lilliefors (Kolmogorov-Smirnov) normality test
##
## data: applicants$Total_Sales
## D = 0.33274, p-value < 2.2e-16
pearson.test(applicants$Total_Sales)
##
## Pearson chi-square normality test
##
## data: applicants$Total_Sales
## P = 5535, p-value < 2.2e-16
sf.test(applicants$Total_Sales)
##
## Shapiro-Francia normality test
##
## data: applicants$Total_Sales
## W = 0.43018, p-value < 2.2e-16
# low p-values, so NOT normally distributed!
# tested with: ad.test(rnorm(100, mean = 5, sd = 3));runif(100, min = 2, max = 4)
# TODO: try to stratify into normal distributions or,
# TODO: try equivalent tools for non-normal distributions: http://www.isixsigma.com/tools-templates/normality/dealing-non-normal-data-strategies-and-tools/
kruskal.test(list(applicants$violation_count, applicants$Total_Sales))
##
## Kruskal-Wallis rank sum test
##
## data: list(applicants$violation_count, applicants$Total_Sales)
## Kruskal-Wallis chi-squared = 5694.5, df = 1, p-value < 2.2e-16
kruskal.test(list(applicants$Total_Sales, applicants$violation_count)) #same!
##
## Kruskal-Wallis rank sum test
##
## data: list(applicants$Total_Sales, applicants$violation_count)
## Kruskal-Wallis chi-squared = 5694.5, df = 1, p-value < 2.2e-16
# look at the violation_count distribution to test for normality
plot(density(applicants$violation_count))

# shapiro.test(applicants$violation_count) # limited to 5000!?
qqnorm(applicants$violation_count)
qqline(applicants$violation_count, col = 2)

ad.test(applicants$violation_count)
##
## Anderson-Darling normality test
##
## data: applicants$violation_count
## A = 1904.1, p-value < 2.2e-16
cvm.test(applicants$violation_count)
## Warning in cvm.test(applicants$violation_count): p-value is smaller than
## 7.37e-10, cannot be computed more accurately
##
## Cramer-von Mises normality test
##
## data: applicants$violation_count
## W = 413.15, p-value = 7.37e-10
lillie.test(applicants$violation_count)
##
## Lilliefors (Kolmogorov-Smirnov) normality test
##
## data: applicants$violation_count
## D = 0.51373, p-value < 2.2e-16
pearson.test(applicants$violation_count)
##
## Pearson chi-square normality test
##
## data: applicants$violation_count
## P = 328860, p-value < 2.2e-16
# sf.test(applicants$violation_count) # limited to 5000!?
# look for anomalies in the data
summary(applicants$ReasonAction) # most applications PENDING, not APPROVED
## APPROVED DISCONTINUED PENDING WITHDRAWN
## 1215 1 4463 224
summary(applicants$State.1) # 13 out of WA state businesses
## CA FO GA ID KS OK OR WA
## 732 5 1 2 1 1 1 2 5158
summary(applicants$type) # PROCESSOR and RETAILER most prevalent
## medical processor producer retailer
## 732 1500 2043 1628
# is there a correlation between sales and violations? overall? in certain segments?
# overall
summary(lm(applicants$violation_count ~ applicants$Total_Sales))
##
## Call:
## lm(formula = applicants$violation_count ~ applicants$Total_Sales)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7512 -0.5760 -0.5261 0.2298 9.0035
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.208e-01 4.323e-02 12.048 <2e-16 ***
## applicants$Total_Sales 2.435e-07 2.585e-08 9.421 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.41 on 1261 degrees of freedom
## (4640 observations deleted due to missingness)
## Multiple R-squared: 0.06576, Adjusted R-squared: 0.06502
## F-statistic: 88.76 on 1 and 1261 DF, p-value: < 2.2e-16
summary(glm(applicants$violation_count ~ applicants$Total_Sales))
##
## Call:
## glm(formula = applicants$violation_count ~ applicants$Total_Sales)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.7512 -0.5760 -0.5261 0.2298 9.0035
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.208e-01 4.323e-02 12.048 <2e-16 ***
## applicants$Total_Sales 2.435e-07 2.585e-08 9.421 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1.988272)
##
## Null deviance: 2683.7 on 1262 degrees of freedom
## Residual deviance: 2507.2 on 1261 degrees of freedom
## (4640 observations deleted due to missingness)
## AIC: 4456.3
##
## Number of Fisher Scoring iterations: 2
# TODO add dummy variables for Suspended/Cancelled/Destruction, since not all violations are severe
# - join applicants with violations and set dummy variable when violations$Penalty_Type %in%
# c('Suspension', 'Cancellation of License', 'Destruction of harvestable plants')
# add dummy variables for each business Type: Producer, Processor, Retailer, Medical
# type.dummies <- dummy('type', applicants, sep=":")
applicants <- dummy.data.frame(names='type', applicants, sep=":")
# TODO add variables for high-risk counties or cities
# **** export data to work in Python/sklearn *****
write.csv(applicants, "applicants_transformed.csv")
# use inference() and t-test to infer risk/sales growth from larger population of similar businesses?
load(url("http://assets.datacamp.com/course/dasi/inference.Rdata"))
#inference()
#tt=t.test()