## Account Type mean min lower middle upper max
## 1: ATM 1.24% 0.53% 0.71% 1.01% 1.47% 4.01%
## 2: Branch 1.65% 0.47% 0.92% 1.2% 1.89% 13.7%
#####Visually, Houses close to a branch appear to have a higher account percentage
##
## Two Sample t-test
##
## data: dt.B$`Percent of Total` and dt.A$`Percent of Total`
## t = 2, df = 100, p-value = 0.05
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 4.05e-05 Inf
## sample estimates:
## mean of x mean of y
## 0.0165 0.0124
##
## Call:
## lm(formula = log(dt$Percent) ~ dt$Footprint)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0213 -0.3965 -0.0651 0.3173 2.3378
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.5265 0.0662 -68.41 <2e-16 ***
## dt$Footprint 0.1981 0.0996 1.99 0.049 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.542 on 118 degrees of freedom
## Multiple R-squared: 0.0325, Adjusted R-squared: 0.0243
## F-statistic: 3.96 on 1 and 118 DF, p-value: 0.0489
library(data.table)
require(ggplot2)
library("scales")
nils <- read.csv("C:\\Users\\cbanicki\\Desktop\\MGMT-6330\\case studies\\nilsbaker.csv")
###Formatting of the data
#####Convert into data table
dt <- data.table(nils)
#Rename the dimensions so they are more understandable
levels(dt$FP_DESCR) <- c('Branch', 'ATM')
#Caculate the market penetration ratio for each row
dt$AccountPercent <- dt$HA/dt$THA
colnames(dt) <- c("ID","Total Households","Household w/ Account","Account Type","Footprint", "Percent of Total")
# Capture the standard deviations of each location type by ratio of households with an account
# knowing these can help to determine whether to use a two sample t-test or the Welch two sample t-test
# if both populations don't have the same standard devation then you should use the Welch sample t-test
# although the results should normally be very similar
#Create separate data tables for each location type
dt.B <- dt[ which(`Account Type`=='Branch' ),]
dt.A <- dt[ which(`Account Type`=='ATM' ),]
sdB <- sd(dt.B$`Percent of Total`) #Standard deviation of branch population
mB <- mean(dt.B$`Percent of Total`) #Mean of the ATM population
sdA <- sd(dt.B$`Percent of Total`) #Standard deviation of Ascorbic Acid population
mA <- mean(dt.B$`Percent of Total`) #Mean of the AC population
##learned from here http://blog.yhat.com/posts/fast-summary-statistics-with-data-dot-table.html
dt.bankStats <- dt[,list(
mean=percent(round(mean(`Percent of Total`),4)),
min=percent(round(min(`Percent of Total`),4)),
lower=percent(round(quantile(`Percent of Total`, .25, na.rm=TRUE),4)),
middle=percent(round(quantile(`Percent of Total`, .50, na.rm=TRUE),4)),
upper=percent(round(quantile(`Percent of Total`, .75, na.rm=TRUE),4)),
max=percent(round(max(`Percent of Total`),4))),
by = list(`Account Type`)]
#Display summary stats
dt.bankStats
qqnorm(dt$`Percent of Total`, main='Check for Normal Distribution')
qqline(dt$`Percent of Total`)
ggplot(dt, aes(x = "Account Type", y = `Percent of Total`)) +
geom_bar(aes(fill=`Account Type`), # fill depends on cond2
stat="identity",
color="black", # Black outline for all
position=position_dodge())
if (sdB == sdA) {
t.test(dt.B$`Percent of Total`,dt.A$`Percent of Total`,alternative="greater", var.equal=TRUE) # Given the same standard deviation
} else t.test(dt.A$`Percent of Total`,dt.B$`Percent of Total`,alternative="one.sided") #Does not assume the same standard deviation (Welsh t-test)