Set up libraries and read in the data.
library(foreign)
library(MASS)
library(fitdistrplus)
## Loading required package: survival Loading required package: splines
library(lattice)
library(poweRlaw)
library(truncnorm)
options(warn = -1)
setwd("~/Dropbox/wrk/promotion")
spssdata <- data.frame(read.spss("EE Promo Leader Ratings.sav", use.value.labels = TRUE))
Take an initial look as how the data were read in.
str(spssdata)
## 'data.frame': 2625 obs. of 88 variables:
## $ ID : num 1 2 3 4 5 6 7 8 9 10 ...
## $ OFFICE : Factor w/ 298 levels " ","A ",..: 245 294 4 261 1 239 237 248 237 97 ...
## $ OFFICE2 : Factor w/ 13 levels " ","A","B","C",..: 11 13 2 11 1 11 11 11 11 8 ...
## $ JOBCODE : Factor w/ 396 levels " ","0001R1",..: 286 22 109 257 1 195 199 202 199 290 ...
## $ WR : Factor w/ 160 levels " ","0001",..: 104 9 39 93 1 68 69 70 69 105 ...
## $ BAND : num 2 5 4 2 NA 3 4 3 4 4 ...
## $ SUPLEV : Factor w/ 16 levels " ","0 ",..: 10 16 10 10 1 10 10 10 10 10 ...
## $ SUPLEV2 : Factor w/ 2 levels "Non-Supervisor",..: 1 2 1 1 1 1 1 1 1 1 ...
## $ SUPLEV3 : Factor w/ 2 levels "Non Super","Team Lead, Super, or Chief": 1 2 1 1 1 1 1 1 1 1 ...
## $ WRTITLE : Factor w/ 232 levels " ",..: 146 25 31 77 1 133 23 59 23 47 ...
## $ CITY : Factor w/ 56 levels " ",..: 55 6 44 7 1 6 6 48 6 6 ...
## $ STATE : Factor w/ 27 levels " ","AUS","AZ ",..: 7 17 27 7 1 17 17 18 17 17 ...
## $ ETHNIC : Factor w/ 8 levels " ","A","B","C",..: 4 6 6 6 1 6 6 6 6 4 ...
## $ ETHNIC2 : Factor w/ 2 levels "Black","White": 1 2 2 2 NA 2 2 2 2 1 ...
## $ SEX : Factor w/ 3 levels " ","F","M": 2 3 3 3 1 3 2 3 3 3 ...
## $ SEX2 : Factor w/ 2 levels "Female","Male": 1 2 2 2 NA 2 1 2 2 2 ...
## $ BIRTH : num 1.15e+10 1.17e+10 1.22e+10 1.19e+10 NA ...
## $ RETIRE : num 1.32e+10 1.35e+10 1.40e+10 1.37e+10 NA ...
## $ S_CM : Factor w/ 2 levels " ","CIV": 2 2 2 2 1 2 2 2 2 2 ...
## $ SID : Factor w/ 380 levels " ","S100",..: 97 218 45 99 1 19 355 228 355 313 ...
## $ SID2 : num 198 317 141 20 NA 116 73 327 73 412 ...
## $ SSTATE : Factor w/ 14 levels " ","AZ ","CA ",..: 4 9 14 4 1 8 8 9 8 8 ...
## $ SCITY : Factor w/ 27 levels " ",..: 26 23 21 5 1 4 4 23 4 4 ...
## $ SETHNIC : Factor w/ 7 levels " ","A","B","C",..: 6 6 6 6 1 6 6 6 6 6 ...
## $ SETHNIC2: Factor w/ 2 levels "Black","White": 2 2 2 2 NA 2 2 2 2 2 ...
## $ SSEX : Factor w/ 3 levels " ","F","M": 3 2 2 3 1 3 3 3 3 2 ...
## $ SSEX2 : Factor w/ 2 levels "Female","Male": 2 1 1 2 NA 2 2 2 2 1 ...
## $ SBDATE : num 1.21e+10 1.17e+10 1.18e+10 1.19e+10 NA ...
## $ SRETIRE : num 1.38e+10 1.35e+10 1.35e+10 1.37e+10 NA ...
## $ R_CM : Factor w/ 3 levels " ","CIV","MIL": 2 2 2 3 1 2 2 2 2 2 ...
## $ RID : Factor w/ 174 levels " ","R1 ",..: 144 150 28 20 1 112 90 42 90 140 ...
## $ PROMO : Factor w/ 5 levels " ",..: 2 4 2 2 1 2 4 4 3 2 ...
## $ PROMO2 : Factor w/ 4 levels "EE Declines",..: 2 3 2 2 NA 2 3 3 1 2 ...
## $ PROMO3 : Factor w/ 2 levels "Decline or Do Not Rec",..: 1 2 1 1 NA 1 2 2 1 1 ...
## $ PROMO4 : Factor w/ 2 levels "Recommend","Strongly Recommend": NA 1 NA NA NA NA 1 1 NA NA ...
## $ ACCOUNT : num 0 0 0 0 NA 0 0 0 0 0 ...
## $ CONFLICT: num 0 0 1 0 NA 0 0 0 0 0 ...
## $ CONTINU : num 0 0 0 0 NA 4 0 0 0 0 ...
## $ CREATIV : num 0 0 0 0 NA 0 0 0 0 0 ...
## $ CULTURAL: num 0 0 0 0 NA 0 0 0 0 0 ...
## $ CUSTOMER: num 0 0 0 0 NA 0 1 0 0 0 ...
## $ DECISIVE: num 0 0 0 0 NA 0 0 0 0 0 ...
## $ ENTREPRE: num 0 0 1 0 NA 0 0 0 0 0 ...
## $ EXTERNAL: num 0 0 0 0 NA 0 0 0 0 0 ...
## $ FINANCIA: num 0 0 1 1 NA 1 0 0 0 0 ...
## $ FLEXIBIL: num 0 0 1 1 NA 0 0 0 0 0 ...
## $ HUMAN_RE: num 0 0 0 0 NA 2 1 0 0 0 ...
## $ INFLUENC: num 0 0 1 0 NA 0 1 0 0 0 ...
## $ INTEGRIT: num 0 0 0 0 NA 0 1 0 0 0 ...
## $ INTERPER: num 0 0 0 0 NA 0 0 0 0 0 ...
## $ ORAL_COM: num 0 0 0 0 NA 0 0 0 0 0 ...
## $ PARTNERI: num 0 0 0 0 NA 0 0 0 0 0 ...
## $ POLITICA: num 0 0 0 0 NA 0 0 0 0 0 ...
## $ PROBLEM : num 0 0 0 0 NA 0 0 0 0 0 ...
## $ RESILIEN: num 0 0 0 0 NA 0 0 0 0 0 ...
## $ SERVICE : num 0 0 1 0 NA 3 0 0 0 0 ...
## $ STRATEGI: num 0 0 0 0 NA 0 0 0 0 0 ...
## $ TEAM_BUI: num 0 0 2 0 NA 1 0 0 0 0 ...
## $ TECH_CR : num 0 0 1 0 NA 2 1 0 0 0 ...
## $ TECH_MGT: num 0 0 0 0 NA 0 0 0 0 0 ...
## $ VISION : num 0 0 0 0 NA 0 0 0 0 0 ...
## $ WRITTEN : num 0 0 0 0 NA 0 0 0 0 0 ...
## $ LEAD_CH : num 0 0 2 1 NA 7 0 0 0 0 ...
## $ LEAD_P : num 0 0 3 0 NA 1 1 0 0 0 ...
## $ RESULTS : num 0 0 2 0 NA 2 2 0 0 0 ...
## $ BUSINESS: num 0 0 1 1 NA 3 1 0 0 0 ...
## $ BUILDING: num 0 0 1 0 NA 0 1 0 0 0 ...
## $ PEOPLE : num 0 0 4 0 NA 1 2 0 0 0 ...
## $ TASK : num 0 0 3 1 NA 5 3 0 0 0 ...
## $ TASK2 : num 0 0 3 1 NA 5 2 0 0 0 ...
## $ PEOPLE2 : num 0 0 4 0 NA 1 3 0 0 0 ...
## $ P_LD_CH : num NA 0 0.0328 0.0714 NA ...
## $ P_LD_P : num NA 0 0.0492 0 NA ...
## $ P_RESULT: num NA 0 0.0328 0 NA ...
## $ P_BUSIN : num NA 0 0.0164 0.0714 NA ...
## $ P_BUILD : num NA 0 0.0164 0 NA ...
## $ WRDCOUNT: num 0 3 61 14 90 116 48 9 12 25 ...
## $ LEADERSH: num 0 0 9 2 NA 13 5 0 0 0 ...
## $ ETH_SETH: num 0 1 1 1 NA 1 1 1 1 0 ...
## $ SEX_SEX2: num 0 0 0 1 NA 1 0 1 1 0 ...
## $ FILTER_.: Factor w/ 2 levels "Not Selected",..: 1 2 1 1 1 1 1 1 1 1 ...
## $ O_A : num 0 0 1 0 0 0 0 0 0 0 ...
## $ O_F : num 0 0 0 0 0 0 0 0 0 0 ...
## $ O_H : num 0 0 0 0 0 0 0 0 0 0 ...
## $ O_I : num 0 0 0 0 0 0 0 0 0 1 ...
## $ O_O : num 0 0 0 0 0 0 0 0 0 0 ...
## $ O_P : num 1 0 0 1 0 1 1 1 1 0 ...
## $ O_S : num 0 0 0 0 0 0 0 0 0 0 ...
Graphical displays of the data seem to be a helpful start to figuring out what is going on with the data.
First, how are the promotion recommendations distributed?
promo.count <- table(spssdata$PROMO2)
## windows()
barplot(promo.count, ylim = c(0, max(promo.count + 75)))
text(seq(0.75, 4.5, by = 1.2), promo.count + 50, promo.count)
There is an interesting comparison to be made of the overall distribution of word counts, which I think is The total number of words supervisors used in their descriptions of the person they are recommending for promotion, and the four conditional distributions.
hist(spssdata$WRDCOUNT)
histogram(~WRDCOUNT | PROMO2, spssdata)
histogram(~LEADERSH, spssdata, subset = !is.na(LEADERSH))
histogram(~LEADERSH | PROMO2, spssdata, subset = !is.na(LEADERSH))
l.dat <- spssdata$LEADERSH[!is.na(spssdata$LEADERSH)]
descdist(l.dat, boot = 1000)
## summary statistics
## ------
## min: 0 max: 68
## median: 6
## mean: 8.74
## estimated sd: 9.557
## estimated skewness: 1.08
## estimated kurtosis: 4.081
## Fit normal distribution and plot fit
fn <- fitdist(l.dat, "norm", method = "mle")
plot(fn)
summary(fn)
## Fitting of the distribution ' norm ' by maximum likelihood
## Parameters :
## estimate Std. Error
## mean 8.740 0.1959
## sd 9.555 0.1385
## Loglikelihood: -8741 AIC: 17487 BIC: 17499
## Correlation matrix:
## mean sd
## mean 1.000e+00 6.172e-09
## sd 6.172e-09 1.000e+00
If the EE Declines, there is essentially no distribution, so it doesn't make send to try the graph.
with(subset(spssdata, PROMO2 == "EE Declines"), table(LEADERSH, useNA = "ifany"))
## LEADERSH
## 0
## 437
If they are not recommended, then it seems more like an exponential or power law distribution.
l.dat <- with(spssdata, LEADERSH[!is.na(LEADERSH) & PROMO2 == "Do Not Rec"])
descdist(l.dat, boot = 1000)
## summary statistics
## ------
## min: 0 max: 40
## median: 3
## mean: 6.062
## estimated sd: 7.5
## estimated skewness: 1.366
## estimated kurtosis: 4.274
## Fit normal distribution and plot fit
fn <- fitdist(l.dat, "norm", method = "mle")
plot(fn)
summary(fn)
## Fitting of the distribution ' norm ' by maximum likelihood
## Parameters :
## estimate Std. Error
## mean 6.062 0.2555
## sd 7.496 0.1806
## Loglikelihood: -2956 AIC: 5916 BIC: 5926
## Correlation matrix:
## mean sd
## mean 1 0
## sd 0 1
The plot does not seem to agree with my eyes, however.
Those who are recommended look to my eye as if they are normally distributed but truncated. I am not sure how to test for that although we could either find something or make something up.
l.dat <- with(spssdata, LEADERSH[!is.na(LEADERSH) & PROMO2 == "Recommend"])
descdist(l.dat, boot = 1000)
## summary statistics
## ------
## min: 0 max: 39
## median: 12
## mean: 12.57
## estimated sd: 8.67
## estimated skewness: 0.5539
## estimated kurtosis: 2.719
## Fit normal distribution and plot fit
fn <- fitdist(l.dat, "norm", method = "mle")
plot(fn)
summary(fn)
## Fitting of the distribution ' norm ' by maximum likelihood
## Parameters :
## estimate Std. Error
## mean 12.572 0.3245
## sd 8.664 0.2294
## Loglikelihood: -2551 AIC: 5106 BIC: 5115
## Correlation matrix:
## mean sd
## mean 1 0
## sd 0 1
The figure looks like it is consistent with this hypothesis. If you truncate a normal distribution, you might lower kurtosis and increase skew.
Those who are recommended look normal.
l.dat <- with(spssdata, LEADERSH[!is.na(LEADERSH) & PROMO2 == "Strongly Rec"])
descdist(l.dat, boot = 1000)
## summary statistics
## ------
## min: 0 max: 68
## median: 18
## mean: 17.98
## estimated sd: 9.612
## estimated skewness: 0.9349
## estimated kurtosis: 6.142
## Fit normal distribution and plot fit
fn <- fitdist(l.dat, "norm", method = "mle")
plot(fn)
summary(fn)
## Fitting of the distribution ' norm ' by maximum likelihood
## Parameters :
## estimate Std. Error
## mean 17.984 0.5011
## sd 9.599 0.3543
## Loglikelihood: -1351 AIC: 2706 BIC: 2713
## Correlation matrix:
## mean sd
## mean 1 0
## sd 0 1
Visually, I missed the right tail.
Thus, we will only use do not recommend, recommend, and strongly recommend ratings.
ldr.nd <- subset(spssdata, subset = (PROMO2 != "EE Declines"), select = c(LEAD_CH,
LEAD_P, RESULTS, BUSINESS, BUILDING, LEADERSH, WRDCOUNT))
These are the broader competencies that are made up of “sub-competencies.” This will keep the counts at a reasonable level. I checked the OPM ECQ website and they have changed since the original analysis was done, but I can explain this. A single lattice plot with the plots would probably work well.
ldr.nd.l <- stack(ldr.nd)
histogram(~values | ind, ldr.nd.l)
count.hist <- function(X, name) {
hist(X, col = "gray", main = name, xlab = "", freq = TRUE)
}
count.hist(ldr.nd$LEAD_CH, "Leading Change")
count.hist(ldr.nd$LEAD_P, "Leading People")
count.hist(ldr.nd$RESULTS, "Results Driven")
count.hist(ldr.nd$BUSINESS, "Business Acumen")
count.hist(ldr.nd$BUILDING, "Team Building")
count.hist(ldr.nd$LEADERSH, "Sum of Leadership")
count.hist(ldr.nd$WRDCOUNT, "Total Words")
do.plfit <- function(X, title = "") {
fit.pl <- displ(X)
estimate_pars(fit.pl)
est = estimate_xmin(fit.pl)
fit.pl$setXmin(est)
plot(fit.pl, main = title)
pred <- lines(fit.pl, col = "blue")
return(list(fit = fit.pl, pred = pred))
}
# wrdcount.plfit <- do.plfit(ldr.nd$WRDCOUNT + 1, 'Wordcount')
lead.ch.plfit <- do.plfit(ldr.nd$LEAD_CH + 1, "Leading Change")
lead.p.plfit <- do.plfit(ldr.nd$LEAD_P + 1, "Leading People")
results.plfit <- do.plfit(ldr.nd$RESULTS + 1, "Results Driven")
business.plfit <- do.plfit(ldr.nd$BUSINESS + 1, "Business Acumen")
building.plfit <- do.plfit(ldr.nd$BUILDING + 1, "Team Building")
leadersh.plfit <- do.plfit(ldr.nd$LEADERSH + 1, "Sum of Leadership")
wrdcount.plfit <- do.plfit(ldr.nd$WRDCOUNT + 1, "Total Words")
fit.dists.bic <- function(X) {
X <- X + 0.5
norm.fd <- fitdist(X, pnorm, method = "mle", start = list(mean = mean(X),
sd = sd(X)))
norm.fd$ks <- ks.test(X, pnorm, norm.fd$estimate["mean"], norm.fd$estimate["sd"])$statistic
tnorm.fd <- fitdist(X, ptruncnorm, method = "mle", start = list(mean = mean(X)/2,
sd = sd(X)), fix.arg = list(a = 0, b = Inf))
tnorm.fd$ks <- ks.test(X, ptruncnorm, 0, Inf, tnorm.fd$estimate["mean"],
tnorm.fd$estimate["sd"])$statistic
lnorm.fd <- fitdist(X, plnorm, method = "mle", start = list(meanlog = log(mean(X)),
sdlog = log(sd(X))))
lnorm.fd$ks <- ks.test(X, plnorm, lnorm.fd$estimate["meanlog"], lnorm.fd$estimate["sdlog"])$statistic
pl.fd <- fitdist(X, pplcon, method = "mle", start = list(alpha = mean(X)),
fix.arg = list(xmin = 0.5))
pl.fd$ks <- ks.test(X, pplcon, as.numeric(pl.fd$fix.arg["xmin"]), pl.fd$estimate["alpha"])$statistic
exp.fd <- fitdist(X, pexp, method = "mle", start = list(rate = 1/mean(X -
1)))
exp.fd$ks <- ks.test(X, pexp, exp.fd$estimate["rate"])$statistic
return(list(norm.fd = norm.fd, tnorm.fd = tnorm.fd, lnorm.fd = lnorm.fd,
pl.fd = pl.fd, exp.fd = exp.fd))
}
fit.dists.ks <- function(X) {
norm.fd <- fitdist(X, pnorm, method = "mge", gof = "KS", start = list(mean = mean(X),
sd = sd(X)))
norm.fd$ks <- ks.test(X, pnorm, norm.fd$estimate["mean"], norm.fd$estimate["sd"])$statistic
tnorm.fd <- fitdist(X + 0.5, ptruncnorm, method = "mge", gof = "KS", start = list(mean = mean(X),
sd = sd(X)), fix.arg = list(a = -1, b = Inf))
tnorm.fd$ks <- ks.test(X + 0.5, ptruncnorm, -0.5, Inf, tnorm.fd$estimate["mean"],
tnorm.fd$estimate["sd"])$statistic
lnorm.fd <- fitdist(X + 0.5, plnorm, method = "mge", gof = "KS", start = list(meanlog = log(mean(X)),
sdlog = log(sd(X))))
lnorm.fd$ks <- ks.test(X, plnorm, lnorm.fd$estimate["meanlog"], lnorm.fd$estimate["sdlog"])$statistic
pl.fd <- fitdist(X + 0.5, pplcon, method = "mge", gof = "KS", start = list(alpha = log(mean(X) +
1)/1.5), fix.arg = list(xmin = 1))
pl.fd$ks <- ks.test(X, pplcon, as.numeric(pl.fd$fix.arg["xmin"]), pl.fd$estimate["alpha"])$statistic
exp.fd <- fitdist(X + 0.5, pexp, method = "mge", gof = "KS", start = list(rate = 1/mean(X -
1)))
exp.fd$ks <- ks.test(X - 1, pexp, exp.fd$estimate["rate"])$statistic
return(list(norm.fd = norm.fd, tnorm.fd = tnorm.fd, lnorm.fd = lnorm.fd,
pl.fd = pl.fd, exp.fd = exp.fd))
}
lead.ch.fits.bic <- fit.dists.bic(ldr.nd$LEAD_CH)
lead.p.fits.bic <- fit.dists.bic(ldr.nd$LEAD_P)
results.fits.bic <- fit.dists.bic(ldr.nd$RESULTS)
business.fits.bic <- fit.dists.bic(ldr.nd$BUSINESS)
building.fits.bic <- fit.dists.bic(ldr.nd$BUILDING)
leadersh.fits.bic <- fit.dists.bic(ldr.nd$LEADERSH)
wrdcount.fits.bic <- fit.dists.bic(ldr.nd$WRDCOUNT)
# lead.ch.fits.ks <- fit.dists.ks(ldr.nd$LEAD_CH) lead.p.fits.ks <-
# fit.dists.ks(ldr.nd$LEAD_P) results.fits.ks <-
# fit.dists.ks(ldr.nd$RESULTS) business.fits.ks <-
# fit.dists.ks(ldr.nd$BUSINESS) building.fits.ks <-
# fit.dists.ks(ldr.nd$BUILDING) leadersh.fits.ks <-
# fit.dists.ks(ldr.nd$LEADERSH) wrdcount.fits.ks <-
# fit.dists.ks(ldr.nd$WRDCOUNT)
fit.bic <- rbind(sapply(lead.ch.fits.bic, function(X) X$bic), sapply(lead.p.fits.bic,
function(X) X$bic), sapply(results.fits.bic, function(X) X$bic), sapply(business.fits.bic,
function(X) X$bic), sapply(building.fits.bic, function(X) X$bic), sapply(leadersh.fits.bic,
function(X) X$bic), sapply(wrdcount.fits.bic, function(X) X$bic))
rownames(fit.bic) <- c("lead.ch", "lead.p", "results", "business", "building",
"leadersh", "wrdcount")
fit.bic
## norm.fd tnorm.fd lnorm.fd pl.fd exp.fd
## lead.ch 9491 8400 8444 8114 8444
## lead.p 8750 7549 7459 6569 7571
## results 9737 8391 8340 7711 8395
## business 8240 6971 6736 5732 6984
## building 8029 6551 6236 4659 6547
## leadersh 14274 13219 13703 14369 13271
## wrdcount 21213 20630 21222 25357 20909
matplot(fit.bic, type = "b", pch = "NTLPE", ylog = TRUE, ylab = "BIC")
fit.ks <- rbind(sapply(lead.ch.fits.bic, function(X) X$ks), sapply(lead.p.fits.bic,
function(X) X$ks), sapply(results.fits.bic, function(X) X$ks), sapply(business.fits.bic,
function(X) X$ks), sapply(building.fits.bic, function(X) X$ks), sapply(leadersh.fits.bic,
function(X) X$ks), sapply(wrdcount.fits.bic, function(X) X$ks))
rownames(fit.ks) <- c("lead.ch", "lead.p", "results", "business", "building",
"leadersh", "wrdcount")
fit.ks
## norm.fd.D tnorm.fd.D lnorm.fd.D pl.fd.D exp.fd.D
## lead.ch 0.16345 0.16044 0.2004 0.2813 0.1433
## lead.p 0.18660 0.19776 0.2410 0.3539 0.1779
## results 0.19006 0.17562 0.2125 0.3101 0.1652
## business 0.22586 0.18383 0.2404 0.3601 0.2017
## building 0.22771 0.23042 0.2910 0.4462 0.2233
## leadersh 0.13054 0.13725 0.1616 0.2822 0.1290
## wrdcount 0.08285 0.07192 0.1238 0.4164 0.1087
matplot(fit.ks, type = "b", pch = "NTLPE", ylab = "Kolmogorov-Smirnov D")
plot.fits <- function(fits, label) {
hist(fits$norm.fd$data, freq = FALSE, col = "gray", main = "Normal Dist.",
xlab = label)
curve(dnorm(x, fits$norm.fd$estimate["mean"], fits$norm.fd$estimate["sd"]),
add = TRUE, col = "red")
hist(fits$norm.fd$data, freq = FALSE, col = "gray", main = "Truncated Normal Dist.",
xlab = label)
curve(dtruncnorm(x, 0, Inf, fits$tnorm.fd$estimate["mean"], fits$tnorm.fd$estimate["sd"]),
add = TRUE, col = "red")
hist(fits$norm.fd$data, freq = FALSE, col = "gray", main = "Log Normal Dist.",
xlab = label)
curve(dlnorm(x, fits$lnorm.fd$estimate["meanlog"], fits$lnorm.fd$estimate["sdlog"]),
add = TRUE, col = "red")
hist(fits$norm.fd$data, freq = FALSE, col = "gray", main = "Power Law Dist.",
xlab = label)
curve(dplcon(x, as.numeric(fits$pl.fd$fix.arg["xmin"]), fits$pl.fd$estimate["alpha"]),
add = TRUE, col = "red")
}
plot.pdfs <- function(fits, label) {
denscomp(fits$norm.fd, main = "Normal Dist.", xlab = label)
denscomp(fits$tnorm.fd, main = "Truncated Normal Dist.", xlab = label)
denscomp(fits$lnorm.fd, main = "Log Normal Dist.", xlab = label)
denscomp(fits$pl.fd, main = "Power Law Dist.", xlab = label)
denscomp(fits$exp.fd, main = "Exponential Dist.", xlab = label)
}
plot.cdfs <- function(fits, label) {
cdfcompcall <- function(fit, main, ...) cdfcomp(fit, xlogscale = FALSE,
ylogscale = FALSE, main = main, xlab = label, addlegend = FALSE, ...)
cdfcompcall(fits$norm.fd, main = "Normal Dist.")
cdfcompcall(fits$tnorm.fd, main = "Truncated Normal Dist.")
cdfcompcall(fits$lnorm.fd, main = "Log Normal Dist.")
cdfcompcall(fits$pl.fd, main = "Power Law Dist.")
cdfcompcall(fits$exp.fd, main = "Exponential Dist.")
}
# plot.fits(lead.ch.fits.bic, 'Leading Change') plot.fits(lead.p.fits.bic,
# 'Leading People') plot.fits(results.fits.bic, 'Results Driven')
# plot.fits(business.fits.bic, 'Business Acumen')
# plot.fits(building.fits.bic, 'Team Building') plot.fits(leadersh.fits.bic,
# 'Sum of Leadership') plot.fits(wrdcount.fits.bic, 'Total Words')
plot.cdfs(lead.ch.fits.bic, "Leading Change")
plot.cdfs(lead.p.fits.bic, "Leading People")
plot.cdfs(results.fits.bic, "Results Driven")
plot.cdfs(business.fits.bic, "Business Acumen")
plot.cdfs(building.fits.bic, "Team Building")
plot.cdfs(leadersh.fits.bic, "Sum of Leadership")
plot.cdfs(wrdcount.fits.bic, "Total Words")
plot.pdfs(lead.ch.fits.bic, "Leading Change")
plot.pdfs(lead.p.fits.bic, "Leading People")
plot.pdfs(results.fits.bic, "Results Driven")
plot.pdfs(business.fits.bic, "Business Acumen")
plot.pdfs(building.fits.bic, "Team Building")
plot.pdfs(leadersh.fits.bic, "Sum of Leadership")
plot.pdfs(wrdcount.fits.bic, "Total Words")
cdfcomp(list(lead.ch.fits.bic$exp.fd, lead.ch.fits.bic$pl.fd), main = "Leading Change",
xlab = "Word Count", legendtext = c("Exponential", "Power Law"), fitcol = "black",
fitlty = 2:3, lwd = 2)
cdfcomp(list(wrdcount.fits.bic$tnorm.fd, wrdcount.fits.bic$pl.fd), main = "Total Words",
xlab = "Word Count", legendtext = c("Truncated Normal", "Power Law"), fitcol = "black",
fitlty = 2:3, lwd = 2)
Does this sound reasonable? It is quite simple. If you are interested in doing something more methodological that’s no problem. I would just like to get something done for sure and then if time is available we can do other things.
pl.gof <- function(N = 100, x.min = 1, alpha = 2) {
A <- round(rplcon(N, x.min, alpha))
# A.fd <- fitdist(A, pplcon, method='mle', start=list(alpha=alpha),
# fix.arg=list(xmin=x.min))
A.fd <- fitdist(A, pplcon, method = "mge", gof = "KS", start = list(alpha = alpha),
fix.arg = list(xmin = x.min))
A.x.min <- as.numeric(A.fd$fix.arg["xmin"])
A.alpha <- as.numeric(A.fd$estimate["alpha"])
return(c(bic = A.fd$bic, ks = ks.test(A, pplcon, A.x.min, A.alpha)$statistic))
}
do.ks.bic.sim <- function(fit, reps = 1000) {
ks.bic.fits <- replicate(reps, pl.gof(length(fit$pl.fd$data), as.numeric(fit$pl.fd$fix.arg["xmin"]),
as.numeric(fit$pl.fd$estimate["alpha"])))
return(list(bic = sum(ks.bic.fits["bic", ] > fit$pl.fd$bic)/reps, ks = sum(ks.bic.fits["ks.D",
] > fit$pl.fd$ks)/reps, res = ks.bic.fits))
}
reps <- 500
pl.p.val.res <- list(do.ks.bic.sim(lead.ch.fits.bic, reps), do.ks.bic.sim(lead.p.fits.bic,
reps), do.ks.bic.sim(results.fits.bic, reps), do.ks.bic.sim(business.fits.bic,
reps), do.ks.bic.sim(building.fits.bic, reps), do.ks.bic.sim(leadersh.fits.bic,
reps), do.ks.bic.sim(wrdcount.fits.bic, reps))
p.tab <- sapply(pl.p.val.res, function(X) c(X[[1]], X[[2]]))
rownames(p.tab) <- c("BIC", "KS-D")
colnames(p.tab) <- c("lead.ch", "lead.p", "results", "business", "building",
"leadersh", "wrdcount")
p.tab
## lead.ch lead.p results business building leadersh wrdcount
## BIC 1.00 1 1 1 1 0.98 0.754
## KS-D 0.02 0 0 0 0 0.00 0.000