require(moments)
## Loading required package: moments
require(car)
## Loading required package: car
## Loading required package: carData
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(fastDummies)
## Loading required package: fastDummies
require(kableExtra)
## Loading required package: kableExtra
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
require(queueing)
## Loading required package: queueing
require(ResourceSelection)
## Loading required package: ResourceSelection
## ResourceSelection 0.3-5 2019-07-22
myprint=function(x) { x%>%kbl()%>%kable_classic(html_font='Cambria')}
Prevalence is the probability that a condition exists in any givent population.
Sensitivity (also known as recall or the True Positive Rate) is defined as the probability of a positive detection given the case is truly positive.
Specificity (also known as selectivity or the True Negative Rate) is defined as the probability of a negative detection given the case is truly negative.
Precision or positive predictive value is defined as the probability a case is truly positive given that it was classified as positive.
Negative predictive value is defined as the probability a case is truly negative given that it was classified as negative.
The F1 score is \(\frac {2 \times True Positive}{2 \times True Positive+FalsePositive+FalseNegative}\). In other words, it is a mixture metric.
Write your own function in R that will take sensitivity, specificity, and prevalence and then compute and return accuracy, positive predictive value, the negative predictive value, the false positive rate, the false negative rate, and the F1 score.
mymetrics=function(sens,spec,prev){
if (sens<0 |sens>1) return ('Sensitivity Error') #Check validity of input
if (spec<0 |spec>1) return ('Specificity Error')
if (prev<0 |prev>1) return ('Prevalence Error')
n=1000 #Assign a random population
total_disease=n*prev #Estimate diseased in the population
total_nodisease=1000-total_disease #Estimated non-diseased
pos_disease=sens*total_disease #Estimate TP
neg_nodisease=spec*total_nodisease #Estimate TN
neg_disease=total_disease-pos_disease #Estimate FN
pos_nodisease=total_nodisease-neg_nodisease #Estimate FP
#Build Confusion Matrix
myt=matrix(c(pos_disease,neg_disease,total_disease,
pos_nodisease,neg_nodisease,total_nodisease,
pos_disease+pos_nodisease,neg_disease+neg_nodisease,total_disease+total_nodisease)
, nrow=3)
#Augment with PPV/NPV/ACC
myt=cbind(myt,c(myt[1,1]/myt[1,3], myt[2,2]/myt[2,3],sum(diag(myt[,1:2]))/n))
#Augment with FPR/NPR/F1
myt=cbind(myt, c(1-spec,1-sens,2*(myt[1,4]*sens/(myt[1,4]+sens))))
rownames(myt)=c("Pos","Neg","Tot")
colnames(myt)=c("Dis","No Dis", "Tot", "PPV/NPV/ACC", "FPR/FNR/F1")
return(round(myt,4))
}
Assume a classification algorithm for detecting cancer is .85 to .95 sensitive (you choose) and .80 to .90 specific (you choose) as well as .01 to .05 prevalent (you choose). Using your function, what is the probability that a randomly selected individual who test positive actually has cancer, P(Cancer | Positive Test)?
Provide all accuracy metrics and interpret.
ans=mymetrics(.9,.82,.01)
myprint(ans)
| Dis | No Dis | Tot | PPV/NPV/ACC | FPR/FNR/F1 | |
|---|---|---|---|---|---|
| Pos | 9 | 178.2 | 187.2 | 0.0481 | 0.1800 |
| Neg | 1 | 811.8 | 812.8 | 0.9988 | 0.1000 |
| Tot | 10 | 990.0 | 1000.0 | 0.8208 | 0.0913 |
print(noquote(c('P(Cancer | +) is ',ans[1,4])))
## [1] P(Cancer | +) is 0.0481
While 82% accurate, the low PPV means that very few of the positives actually have the disease, P(Cancer | +). The accuracy is bolstered by the low prevalence of 1%. The NPV of 0.99877 is bolstered by the sheer number of negatives.
Probability questions can often be answered by multiple distributions if the assumptions hold. Build an R function manually (not a built-in R functions) that takes in a fixed sample size, the probability of success, the number of successes, and the directionality (equal, not equal, greater than or equal to, less than or equal to) and returns a. binomial, b. Poisson (if assumptions hold), c.a normal approximation of the binomial probability if the assumptions hold
myprob=function(x,N,p, dir='='){
lambda=N*p #calculate lambda
Mu=N*p #calculate Mu
sigma=sqrt(N*p*(1-p)) #calculate sigma
p1=1/(sigma*sqrt(2*3.141593)) #calculate the first part of the normal distribution
if(dir=='='){
myb=choose(N,x)*p^x*(1-p)^(N-x)
myp=exp(-lambda)*lambda^x/factorial(x)
myn=p1*exp(-.5*((x-.5-Mu)/sigma)^2)-p1*exp(-.5*((x+.5-Mu)/sigma)^2)
myanswers=round(c(myb,myp,myn),4)
}
if(dir=='<>'){
myb=1-choose(N,x)*p^x*(1-p)^(N-x)
myp=1-exp(-lambda)*lambda^x/factorial(x)
myn=1-(p1*exp(-.5*((x-.5-Mu)/sigma)^2)-p1*exp(-.5*((x+.5-Mu)/sigma)^2))
myanswers=round(c(myb,myp,myn),4)
}
if(dir=='<='){
myb=0
myp=0
p2=exp(-.5*((x+.5-Mu)/sigma)^2)
myn=p1*p2
for (i in 0:x){
myb=myb+choose(N,i)*p^i*(1-p)^(N-i)
myp=myp+exp(-lambda)*lambda^i/factorial(i)
}
myanswers=round(c(myb,myp,myn),4)
}
if(dir=='>='){
myb=0
myp=0
p2=exp(-.5*((x+.5-Mu)/sigma)^2)
myn=p1*p2
for (i in x:N){
myb=myb+choose(N,i)*p^i*(1-p)^(N-i)
myp=myp+exp(-lambda)*lambda^i/factorial(i)
}
myanswers=round(c(myb,myp,myn),4)
}
if (N*p<5 | N*(1-p)<5) myanswers[3]='NA'
if (N<20) myanswers[2]='NA'
if(min(N*p,N*(1-p))>5) myanswers[2]='NA'
names(myanswers)=c('B','P','N')
return(noquote(myanswers))
}
forprint=cbind(myprob(8,20,.2,'='), myprob(8,20,.2,'<='), myprob(8,10,.5,">="))
colnames(forprint)=c('Prob 1: 8,20,.2, =', 'Prob 2: 8, 20, .2, <=', 'Probb 3: 8, 10, .5, >=')
myprint(forprint)
| Prob 1: 8,20,.2, = | Prob 2: 8, 20, .2, <= | Probb 3: 8, 10, .5, >= | |
|---|---|---|---|
| B | 0.0222 | 0.99 | 0.0547 |
| P | 0.0298 | 0.9786 | NA |
| N | NA | NA | 0.0218 |
Write a function (manually, not using built-in R functions) that returns measures of center, dispersion, location, and scale for variables in an R dataframe. The function should return only those measurements appropriate for the data type (e.g., nominal, ordinal, interval, and ratio.) The function should also return at least one appropriate graph for each data type and should handle missing data.
med=function (x, na.rm = FALSE) {
if (is.factor(x) || is.data.frame(x)) stop("need numeric data")
if (length(x) ==0) stop("no data")
half=(length(x) + 1L)%/%2L
if (length(x)%%2L == 1L) sort(x, partial = half)[half]
else mean(sort(x, partial = half + 0L:1L)[half + 0L:1L])
}
mysd=function(x){
if (is.factor(x) || is.data.frame(x)) stop("need numeric data")
sqrt(sum((x-(sum(x)/length(x)))^2)/length(x))
}
desc=function(a)
{
a=dummy_cols(a, remove_selected_columns = T)
myf=function(x) {x=na.omit(x); c(length(x), sum(x)/length(x), med(x),x[which.max(table(x))],
mysd(x), mysd(x)^2,IQR(x), max(x)-min(x),
mad(x, center=mean(x)), mad(x),
100*sd(x)/mean(x),as.numeric(quantile(x,c(0,.05,.25,.75,.95,1),na.rm=TRUE)),
skewness(x), kurtosis(x))}
tmp=apply(a, 2, FUN=myf)
rownames(tmp)=c("n","Mean","Median","Mode (1st)","SD", "Var", "IQR", "Range",
"MAD", "MedAD", "CV", "Min", "5th%", "25%", "75%", "95%", "Max", "Skew", "Kurtosis")
par(mfrow=c(2,2))
for(i in 1:ncol(a)){
if(length(unique(a[,i]))>2){
hist(a[,i],main= colnames(a)[i])
boxplot(a[,i], notch=F, horizontal=T,main=colnames(a)[i])}else
{barplot(table(a[,i]), main=colnames(a)[i])}}
return(as.data.frame(tmp))
}
desc(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species_setosa
## n 150.0000000 150.0000000 150.0000000 150.0000000 150.0000000
## Mean 5.8433333 3.0573333 3.7580000 1.1993333 0.3333333
## Median 5.8000000 3.0000000 4.3500000 1.3000000 0.0000000
## Mode (1st) 5.0000000 3.1000000 1.4000000 0.2000000 1.0000000
## SD 0.8253013 0.4344110 1.7594041 0.7596926 0.4714045
## Var 0.6811222 0.1887129 3.0955027 0.5771329 0.2222222
## IQR 1.3000000 0.5000000 3.5000000 1.5000000 1.0000000
## Range 3.6000000 2.4000000 5.9000000 2.4000000 1.0000000
## MAD 0.9735740 0.3815224 2.6568192 1.1850916 0.4942000
## MedAD 1.0378200 0.4447800 1.8532500 1.0378200 0.0000000
## CV 14.1711260 14.2564201 46.9744075 63.5551141 141.8951310
## Min 4.3000000 2.0000000 1.0000000 0.1000000 0.0000000
## 5th% 4.6000000 2.3450000 1.3000000 0.2000000 0.0000000
## 25% 5.1000000 2.8000000 1.6000000 0.3000000 0.0000000
## 75% 6.4000000 3.3000000 5.1000000 1.8000000 1.0000000
## 95% 7.2550000 3.8000000 6.1000000 2.3000000 1.0000000
## Max 7.9000000 4.4000000 6.9000000 2.5000000 1.0000000
## Skew 0.3117531 0.3157671 -0.2721277 -0.1019342 0.7071068
## Kurtosis 2.4264321 3.1809763 1.6044641 1.6639326 1.5000000
## Species_versicolor Species_virginica
## n 150.0000000 150.0000000
## Mean 0.3333333 0.3333333
## Median 0.0000000 0.0000000
## Mode (1st) 0.0000000 0.0000000
## SD 0.4714045 0.4714045
## Var 0.2222222 0.2222222
## IQR 1.0000000 1.0000000
## Range 1.0000000 1.0000000
## MAD 0.4942000 0.4942000
## MedAD 0.0000000 0.0000000
## CV 141.8951310 141.8951310
## Min 0.0000000 0.0000000
## 5th% 0.0000000 0.0000000
## 25% 0.0000000 0.0000000
## 75% 1.0000000 1.0000000
## 95% 1.0000000 1.0000000
## Max 1.0000000 1.0000000
## Skew 0.7071068 0.7071068
## Kurtosis 1.5000000 1.5000000
Run this function on the training set of the Titanic dataset.
Discuss what the descriptive analysis tells us about the data.
mydata=read.csv('c:/users/lfult/documents/titanic/train.csv', stringsAsFactors = T)
mydata$Sex=factor(mydata$Sex)
mydata$Embarked=factor(mydata$Embarked)
tmp=factor(mydata$Name)
tmp2=factor(mydata$Ticket)
tmp3=factor(mydata$Cabin)
tmp4=mydata$PassengerId
tmp=mydata$Name
mydata$Name=mydata$Ticket=mydata$Cabin=mydata$PassengerId=NULL #Not needed for analysis
as.data.frame(round(t(desc(mydata)),3))
## n Mean Median Mode (1st) SD Var IQR Range MAD
## Survived 891 0.384 0.000 0.000 0.486 0.237 1.000 1.000 0.569
## Pclass 891 2.309 3.000 3.000 0.836 0.698 1.000 2.000 1.025
## Age 714 29.699 28.000 40.000 14.516 210.724 17.875 79.580 13.268
## SibSp 891 0.523 0.000 1.000 1.102 1.215 1.000 8.000 0.775
## Parch 891 0.382 0.000 0.000 0.806 0.649 0.000 6.000 0.566
## Fare 891 32.204 14.454 41.579 49.666 2466.665 23.090 512.329 34.903
## Sex_female 891 0.352 0.000 0.000 0.478 0.228 1.000 1.000 0.522
## Sex_male 891 0.648 1.000 0.000 0.478 0.228 1.000 1.000 0.522
## Embarked_ 891 0.002 0.000 0.000 0.047 0.002 0.000 1.000 0.003
## Embarked_C 891 0.189 0.000 0.000 0.391 0.153 0.000 1.000 0.280
## Embarked_Q 891 0.086 0.000 0.000 0.281 0.079 0.000 1.000 0.128
## Embarked_S 891 0.723 1.000 0.000 0.448 0.200 1.000 1.000 0.411
## MedAD CV Min 5th% 25% 75% 95% Max Skew
## Survived 0.000 126.770 0.00 0.000 0.000 1 1.000 1.000 0.478
## Pclass 0.000 36.215 1.00 1.000 2.000 3 3.000 3.000 -0.629
## Age 13.343 48.912 0.42 4.000 20.125 38 56.000 80.000 0.388
## SibSp 0.000 210.846 0.00 0.000 0.000 1 3.000 8.000 3.689
## Parch 0.000 211.234 0.00 0.000 0.000 0 2.000 6.000 2.744
## Fare 10.236 154.307 0.00 7.225 7.910 31 112.079 512.329 4.779
## Sex_female 0.000 135.633 0.00 0.000 0.000 1 1.000 1.000 0.618
## Sex_male 0.000 73.811 0.00 0.000 0.000 1 1.000 1.000 -0.618
## Embarked_ 0.000 2109.501 0.00 0.000 0.000 0 0.000 1.000 21.036
## Embarked_C 0.000 207.567 0.00 0.000 0.000 0 1.000 1.000 1.592
## Embarked_Q 0.000 325.320 0.00 0.000 0.000 0 1.000 1.000 2.944
## Embarked_S 0.000 61.965 0.00 0.000 0.000 1 1.000 1.000 -0.995
## Kurtosis
## Survived 1.228
## Pclass 1.720
## Age 3.169
## SibSp 20.774
## Parch 12.717
## Fare 36.204
## Sex_female 1.382
## Sex_male 1.382
## Embarked_ 443.502
## Embarked_C 3.536
## Embarked_Q 9.666
## Embarked_S 1.991
mydata$Name=tmp
Complete exploratory data analysis of the Titanic training set by building cross tabulations of variables, imputing missing if / as appropriate, and creating new features from some of the fields. You can combine, split, etc. Feature creation is an important skill set, as you can often find new insights about your data.
Discuss everything you do.
levels(mydata$Embarked)=c("S", "C", "Q","S") #Recode Missing with the Mode
mydata$Age[is.na(mydata$Age)]=mean(mydata$Age,na.rm=TRUE)
mydata$Parch[mydata$Parch>=3]=3
newName=as.vector(mydata$Name)
#Now, let's split at the comma
mydf=data.frame(do.call('rbind', strsplit(as.character(mydata$Name),", ", fixed=TRUE)))
str(mydf)
## 'data.frame': 891 obs. of 2 variables:
## $ X1: chr "Braund" "Cumings" "Heikkinen" "Futrelle" ...
## $ X2: chr "Mr. Owen Harris" "Mrs. John Bradley (Florence Briggs Thayer)" "Miss. Laina" "Mrs. Jacques Heath (Lily May Peel)" ...
#Now, split the second column at the period.
newTitle=as.vector(mydf$X2)
mydf2=data.frame(do.call('rbind', strsplit(as.character(newTitle),".", fixed=TRUE)))
## Warning in rbind(c("Mr", " Owen Harris"), c("Mrs", " John Bradley (Florence
## Briggs Thayer)": number of columns of result is not a multiple of vector length
## (arg 1)
str(mydf2)
## 'data.frame': 891 obs. of 3 variables:
## $ X1: chr "Mr" "Mrs" "Miss" "Mrs" ...
## $ X2: chr " Owen Harris" " John Bradley (Florence Briggs Thayer)" " Laina" " Jacques Heath (Lily May Peel)" ...
## $ X3: chr "Mr" "Mrs" "Miss" "Mrs" ...
head(mydf2)
## X1 X2 X3
## 1 Mr Owen Harris Mr
## 2 Mrs John Bradley (Florence Briggs Thayer) Mrs
## 3 Miss Laina Miss
## 4 Mrs Jacques Heath (Lily May Peel) Mrs
## 5 Mr William Henry Mr
## 6 Mr James Mr
#We had better verify that col1 and col3 are identical
table(mydf2$X1,mydf2$X3)
##
## Barrett) Capt Col Don Dr Jonkheer Lady Major Master Miss Mlle
## Capt 0 1 0 0 0 0 0 0 0 0 0
## Col 0 0 2 0 0 0 0 0 0 0 0
## Don 0 0 0 1 0 0 0 0 0 0 0
## Dr 0 0 0 0 7 0 0 0 0 0 0
## Jonkheer 0 0 0 0 0 1 0 0 0 0 0
## Lady 0 0 0 0 0 0 1 0 0 0 0
## Major 0 0 0 0 0 0 0 2 0 0 0
## Master 0 0 0 0 0 0 0 0 40 0 0
## Miss 0 0 0 0 0 0 0 0 0 182 0
## Mlle 0 0 0 0 0 0 0 0 0 0 2
## Mme 0 0 0 0 0 0 0 0 0 0 0
## Mr 0 0 0 0 0 0 0 0 0 0 0
## Mrs 1 0 0 0 0 0 0 0 0 0 0
## Ms 0 0 0 0 0 0 0 0 0 0 0
## Rev 0 0 0 0 0 0 0 0 0 0 0
## Sir 0 0 0 0 0 0 0 0 0 0 0
## the Countess 0 0 0 0 0 0 0 0 0 0 0
##
## Mme Mr Mrs Ms Rev Sir the Countess
## Capt 0 0 0 0 0 0 0
## Col 0 0 0 0 0 0 0
## Don 0 0 0 0 0 0 0
## Dr 0 0 0 0 0 0 0
## Jonkheer 0 0 0 0 0 0 0
## Lady 0 0 0 0 0 0 0
## Major 0 0 0 0 0 0 0
## Master 0 0 0 0 0 0 0
## Miss 0 0 0 0 0 0 0
## Mlle 0 0 0 0 0 0 0
## Mme 1 0 0 0 0 0 0
## Mr 0 517 0 0 0 0 0
## Mrs 0 0 124 0 0 0 0
## Ms 0 0 0 1 0 0 0
## Rev 0 0 0 0 6 0 0
## Sir 0 0 0 0 0 1 0
## the Countess 0 0 0 0 0 0 1
#Put these values in a new variable
mydata$LastName=mydf$X1
mydata$Title=factor(mydf2$X1)
mydata$FirstName=mydf2$X2
str(mydata)
## 'data.frame': 891 obs. of 12 variables:
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : num 0 0 0 0 0 0 0 1 2 0 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Embarked : Factor w/ 3 levels "S","C","Q": 1 2 1 1 1 3 1 1 1 2 ...
## $ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
## $ LastName : chr "Braund" "Cumings" "Heikkinen" "Futrelle" ...
## $ Title : Factor w/ 17 levels "Capt","Col","Don",..: 12 13 9 13 12 12 12 8 13 13 ...
## $ FirstName: chr " Owen Harris" " John Bradley (Florence Briggs Thayer)" " Laina" " Jacques Heath (Lily May Peel)" ...
We notice some discrepancies…But for now, we are good.We would collapse levels for analysis as well.
boxplot(mydata$Age~mydata$Title, col=rainbow(10), las=2, xlab="", ylab='Age')
You can see that some of these variables should be considered/recoded to factor level.
temp=mydata[,-c(6:12)]
temp$Sex=as.numeric(temp$Sex)
kdepairs(temp)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
Using the modified training data that you created in Question 4 (e.g., imputations, feature creation, etc.), test the hypothesis that the proportion of those under age 30 who survived was greater than that of those 30 and older. Use an alpha of .05. Interpret the results
Let \(X_1\) be the Normal random variable (by approximation) tracking survival proportion (\(p_1\)) of those individuals \(\le 30\) with the true proportion being \(\pi_1\).
Let \(X_2\) be the Normal random variable tracking survival proportion (\(p_2\)) of those individuals \(ge 30\) with the true proportion being \(\pi_2\).
\(H_a: \pi_1-\pi_2 \le 0\) \(H_a: \pi_1-\pi_2 > 0\) \(\alpha=.05\) \(N(p_1-p_2, \sqrt {\bar p \times (1-\bar p) \times (1/n_1+1/n_2)})\)
t1=table(mydata$Survived[mydata$Age<=30])
t2=table(mydata$Survived[mydata$Age>30])
prop.test(x=c(t1[2],t2[2]), n=c(sum(t1),sum(t2)), correct='FALSE',alternative='greater')
##
## 2-sample test for equality of proportions without continuity correction
##
## data: c(t1[2], t2[2]) out of c(sum(t1), sum(t2))
## X-squared = 1.0121, df = 1, p-value = 0.8428
## alternative hypothesis: greater
## 95 percent confidence interval:
## -0.09127836 1.00000000
## sample estimates:
## prop 1 prop 2
## 0.3720137 0.4065574
#not statistically different
#Manually
pbar=(t1[2]+t2[2])/(sum(t1)+sum(t2))
Z=(t1[2]/sum(t1)-t2[2]/sum(t2)) / sqrt(pbar*(1-pbar)*(1/sum(t1)+1/sum(t2)))
myprint(1-pnorm(Z))
| x |
|---|
| 0.8427975 |
Since \(p\) is > \(\alpha\), we fail to reject the null. No evidence exists in this sample that the younger individuals (\(\le 30\)) survived at a higher proportion.
Provide a 98% confidence interval for the proportion of females who survived. Provide a 98% confidence interval for the proportion of males who survived. What can you glean from this analysis? Tell me what you learned.
The CI’s don’t overlap. That suggests a difference in male/female survival.
ft=table(mydata$Survived[mydata$Sex=='female'])
mt=table(mydata$Survived[mydata$Sex=='male'])
prop.test(ft[2], sum(ft), conf.level=0.98)$conf.int
## [1] 0.6791630 0.7964867
## attr(,"conf.level")
## [1] 0.98
prop.test(mt[2], sum(mt), conf.level=0.98)$conf.int
## [1] 0.1531627 0.2305662
## attr(,"conf.level")
## [1] 0.98
plot(as.factor(mydata$Survived),mydata$Sex, col=c('darkred','darkgreen'), ylab='', xlab='Survived')
Evaluate survival as a function of the classifier gender (sex) by building a table of the two. Gender is the classifier for survival, meaning that a perfect fit would have all the numbers on the diagonal of the table. Then write a function that takes this table and returns the sensitivity, specificity, positive predictive value, negative predictive value, and F1 Score. Interpret the metrics.
#I define being female as the treatment and survival as the outcome.
(myt=table(mydata$Survived,mydata$Sex))
##
## female male
## 0 81 468
## 1 233 109
myt2=addmargins(matrix(c(myt[1,2], myt[2,2], myt[1,1], myt[2,1]),nrow=2))
colnames(myt2)=c('male','female', 'sum')
rownames(myt2)=c('died', 'survived','sum')
myt2
## male female sum
## died 468 81 549
## survived 109 233 342
## sum 577 314 891
newf=function(x){
acc=(x[1,1]+x[2,2])/(x[1,1]+x[1,2]+x[2,1]+x[2,2])
sens=x[2,2]/x[2,3]
spec=x[1,1]/x[1,3]
ppv=x[2,2]/x[3,2]
npv=x[1,1]/x[3,1]
f1=2*x[2,2]/(2*x[2,2]+x[1,2]+x[2,1])
myreturn=c(acc, sens,spec,ppv,npv,f1)
names(myreturn)=c('Acc', 'Sens','Spec','PPV','NPV', 'F1')
return(myreturn)
}
newf(myt2)
## Acc Sens Spec PPV NPV F1
## 0.7867565 0.6812865 0.8524590 0.7420382 0.8110919 0.7103659
On the training set, gender alone provides 78.67% accuracy.
Customers arrive to a bank at the rate of 3 per hour distributed as Poisson. Any teller in the bank and is capable of handling 2 customers per hour distributed exponentially. Assume there are 2 tellers on duy at all time.
What is the expected wait time in the queue, and how many individuals will be waiting in line on average?
NOTE: there are 2 tellers. Officially, this is an M/M/2 queue. Just a bit of research.. https://www.johndcook.com/blog/2022/01/12/mm2/
lambda=3 #rate of arrival per hour
Mu=2 #rate of service for each teller
n=2 #number of tellers
(Wait_Time=lambda^2/(Mu*(4*Mu^2-lambda^2)))
## [1] 0.6428571
(Length_Q=lambda*Wait_Time)
## [1] 1.928571
mmc=QueueingModel(NewInput.MMC(3,2,2))
print(c(mmc$Wq, mmc$Lq))
## [1] 0.6428571 1.9285714
If there is only one teller who can handle 3 customers per hour, what is the expected length of the line?
Infinity
mm1=QueueingModel(NewInput.MM1(2.999999999999999,3))
print(c(mm1$Lq))
## [1] 3.3777e+15