setwd("C:\\Users\\USUARIO\\OneDrive - University of Texas at El Paso\\UTEP\\VII. Spring 2022\\STAT 5336 Categorical DA\\Exam 3")
getwd()
## [1] "C:/Users/USUARIO/OneDrive - University of Texas at El Paso/UTEP/VII. Spring 2022/STAT 5336 Categorical DA/Exam 3"
Example contingency table https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/xtabs
#Creating dataframe
df = data.frame(
"Name" = c("Amiya", "Rosy", "Asish"),
"Gender" = c("Male", "Female", "Male")
)
df
## Name Gender
## 1 Amiya Male
## 2 Rosy Female
## 3 Asish Male
# Rotating the data frame
newDf = t(df)
newDf
## [,1] [,2] [,3]
## Name "Amiya" "Rosy" "Asish"
## Gender "Male" "Female" "Male"
# Creating a matrix
A = matrix(
c(1, 2, 4, 1, 5, 6, 2, 4, 7),
nrow = 3,
ncol = 3
)
newTable = as.table(A)
newTable
## A B C
## A 1 1 2
## B 2 5 4
## C 4 6 7
# Converting data frame object to matrix object
modifiedDf = as.matrix(df)
newTable = as.table(modifiedDf)
newTable
## Name Gender
## A Amiya Male
## B Rosy Female
## C Asish Male
#dataTAB <-read.table("survey.dat", header=TRUE, sep= "\t")
#https://www.r-bloggers.com/2020/12/contingency-tables-in-r/
library(Rfast)
## Warning: package 'Rfast' was built under R version 4.1.3
## Loading required package: Rcpp
## Loading required package: RcppZiggurat
## Warning: package 'RcppZiggurat' was built under R version 4.1.3
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.1.3
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.5 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x purrr::is_integer() masks Rfast::is_integer()
## x dplyr::lag() masks stats::lag()
## x dplyr::nth() masks Rfast::nth()
## x purrr::transpose() masks Rfast::transpose()
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
#Wage dataset from the ISLR package.
Wage<-read.csv("Wage.csv")
head(Wage)
## year age sex maritl race education region
## 1 2006 18 1. Male 1. Never Married 1. White 1. < HS Grad 2. Middle Atlantic
## 2 2004 24 1. Male 1. Never Married 1. White 4. College Grad 2. Middle Atlantic
## 3 2003 45 1. Male 2. Married 1. White 3. Some College 2. Middle Atlantic
## 4 2003 43 1. Male 2. Married 3. Asian 4. College Grad 2. Middle Atlantic
## 5 2005 50 1. Male 4. Divorced 1. White 2. HS Grad 2. Middle Atlantic
## 6 2008 54 1. Male 2. Married 1. White 4. College Grad 2. Middle Atlantic
## jobclass health health_ins logwage wage
## 1 1. Industrial 1. <=Good 2. No 4.318063 75.04315
## 2 2. Information 2. >=Very Good 2. No 4.255273 70.47602
## 3 1. Industrial 1. <=Good 1. Yes 4.875061 130.98218
## 4 2. Information 2. >=Very Good 1. Yes 5.041393 154.68529
## 5 2. Information 1. <=Good 1. Yes 4.318063 75.04315
## 6 2. Information 2. >=Very Good 1. Yes 4.845098 127.11574
Wage$wage_cat<-as.factor(ifelse(Wage$wage>median(Wage$wage),"Above","Below"))
con1<-table(Wage$jobclass,Wage$wage_cat)
con1
##
## Above Below
## 1. Industrial 629 915
## 2. Information 854 602
mosaicplot(con1)
#Three-way tables
con4<-xtabs(~jobclass+wage_cat+race, data=Wage)
ftable(con4)
## race 1. White 2. Black 3. Asian 4. Other
## jobclass wage_cat
## 1. Industrial Above 558 32 36 3
## Below 767 79 50 19
## 2. Information Above 701 70 77 6
## Below 454 112 27 9
con4%>%ftable(row.vars=c("race", "jobclass"))
## wage_cat Above Below
## race jobclass
## 1. White 1. Industrial 558 767
## 2. Information 701 454
## 2. Black 1. Industrial 32 79
## 2. Information 70 112
## 3. Asian 1. Industrial 36 50
## 2. Information 77 27
## 4. Other 1. Industrial 3 19
## 2. Information 6 9
#prob by row
con4%>%ftable(row.vars=c("race", "jobclass"))%>%prop.table(margin = 1)%>%round(2)
## wage_cat Above Below
## race jobclass
## 1. White 1. Industrial 0.42 0.58
## 2. Information 0.61 0.39
## 2. Black 1. Industrial 0.29 0.71
## 2. Information 0.38 0.62
## 3. Asian 1. Industrial 0.42 0.58
## 2. Information 0.74 0.26
## 4. Other 1. Industrial 0.14 0.86
## 2. Information 0.40 0.60
#2x2x4 table where the race has 4 levels.
for (i in 1:4) {
print(odds.ratio(con4[,,i])$res[1])
}
## odds ratio
## 0.471169
## odds ratio
## 0.6481013
## odds ratio
## 0.2524675
## odds ratio
## 0.2368421
## This is already a contingency table in array form.
DF <- as.data.frame(UCBAdmissions)
## Now 'DF' is a data frame with a grid of the factors and the counts
## in variable 'Freq'.
DF
## Admit Gender Dept Freq
## 1 Admitted Male A 512
## 2 Rejected Male A 313
## 3 Admitted Female A 89
## 4 Rejected Female A 19
## 5 Admitted Male B 353
## 6 Rejected Male B 207
## 7 Admitted Female B 17
## 8 Rejected Female B 8
## 9 Admitted Male C 120
## 10 Rejected Male C 205
## 11 Admitted Female C 202
## 12 Rejected Female C 391
## 13 Admitted Male D 138
## 14 Rejected Male D 279
## 15 Admitted Female D 131
## 16 Rejected Female D 244
## 17 Admitted Male E 53
## 18 Rejected Male E 138
## 19 Admitted Female E 94
## 20 Rejected Female E 299
## 21 Admitted Male F 22
## 22 Rejected Male F 351
## 23 Admitted Female F 24
## 24 Rejected Female F 317
## Nice for taking margins ...
xtabs(Freq ~ Gender + Admit, DF)
## Admit
## Gender Admitted Rejected
## Male 1198 1493
## Female 557 1278
## And for testing independence ...
summary(xtabs(Freq ~ ., DF))
## Call: xtabs(formula = Freq ~ ., data = DF)
## Number of cases in table: 4526
## Number of factors: 3
## Test for independence of all factors:
## Chisq = 2000.3, df = 16, p-value = 0
## with NA's
DN <- DF; DN[cbind(6:9, c(1:2,4,1))] <- NA; DN
## Admit Gender Dept Freq
## 1 Admitted Male A 512
## 2 Rejected Male A 313
## 3 Admitted Female A 89
## 4 Rejected Female A 19
## 5 Admitted Male B 353
## 6 <NA> Male B 207
## 7 Admitted <NA> B 17
## 8 Rejected Female B NA
## 9 <NA> Male C 120
## 10 Rejected Male C 205
## 11 Admitted Female C 202
## 12 Rejected Female C 391
## 13 Admitted Male D 138
## 14 Rejected Male D 279
## 15 Admitted Female D 131
## 16 Rejected Female D 244
## 17 Admitted Male E 53
## 18 Rejected Male E 138
## 19 Admitted Female E 94
## 20 Rejected Female E 299
## 21 Admitted Male F 22
## 22 Rejected Male F 351
## 23 Admitted Female F 24
## 24 Rejected Female F 317
tools::assertError(# 'na.fail' should fail :
xtabs(Freq ~ Gender + Admit, DN, na.action=na.fail))
xtabs(Freq ~ Gender + Admit, DN)
## Admit
## Gender Admitted Rejected
## Male 1078 1286
## Female 540 1270
xtabs(Freq ~ Gender + Admit, DN, na.action = na.pass)
## Admit
## Gender Admitted Rejected
## Male 1078 1286
## Female 540
## The Female:Rejected combination has NA 'Freq' (and NA prints 'invisibly' as "")
xtabs(Freq ~ Gender + Admit, DN, addNA = TRUE) # ==> count NAs
## Admit
## Gender Admitted Rejected <NA>
## Male 1078 1286 327
## Female 540 0
## <NA> 17 0 0
## Create a nice display for the warp break data.
warpbreaks$replicate <- rep_len(1:9, 54)
ftable(xtabs(breaks ~ wool + tension + replicate, data = warpbreaks))
## replicate 1 2 3 4 5 6 7 8 9
## wool tension
## A L 26 30 54 25 70 52 51 26 67
## M 18 21 29 17 12 18 35 30 36
## H 36 21 24 18 10 43 28 15 26
## B L 27 14 29 19 29 31 41 20 44
## M 42 26 19 16 39 28 21 39 29
## H 20 21 24 17 13 15 15 16 28
### ---- Sparse Examples ----
d.ergo <- data.frame(Type = paste0("T", rep(1:4, 9*4)),
Subj = gl(9, 4, 36*4))
d.ergo
## Type Subj
## 1 T1 1
## 2 T2 1
## 3 T3 1
## 4 T4 1
## 5 T1 2
## 6 T2 2
## 7 T3 2
## 8 T4 2
## 9 T1 3
## 10 T2 3
## 11 T3 3
## 12 T4 3
## 13 T1 4
## 14 T2 4
## 15 T3 4
## 16 T4 4
## 17 T1 5
## 18 T2 5
## 19 T3 5
## 20 T4 5
## 21 T1 6
## 22 T2 6
## 23 T3 6
## 24 T4 6
## 25 T1 7
## 26 T2 7
## 27 T3 7
## 28 T4 7
## 29 T1 8
## 30 T2 8
## 31 T3 8
## 32 T4 8
## 33 T1 9
## 34 T2 9
## 35 T3 9
## 36 T4 9
## 37 T1 1
## 38 T2 1
## 39 T3 1
## 40 T4 1
## 41 T1 2
## 42 T2 2
## 43 T3 2
## 44 T4 2
## 45 T1 3
## 46 T2 3
## 47 T3 3
## 48 T4 3
## 49 T1 4
## 50 T2 4
## 51 T3 4
## 52 T4 4
## 53 T1 5
## 54 T2 5
## 55 T3 5
## 56 T4 5
## 57 T1 6
## 58 T2 6
## 59 T3 6
## 60 T4 6
## 61 T1 7
## 62 T2 7
## 63 T3 7
## 64 T4 7
## 65 T1 8
## 66 T2 8
## 67 T3 8
## 68 T4 8
## 69 T1 9
## 70 T2 9
## 71 T3 9
## 72 T4 9
## 73 T1 1
## 74 T2 1
## 75 T3 1
## 76 T4 1
## 77 T1 2
## 78 T2 2
## 79 T3 2
## 80 T4 2
## 81 T1 3
## 82 T2 3
## 83 T3 3
## 84 T4 3
## 85 T1 4
## 86 T2 4
## 87 T3 4
## 88 T4 4
## 89 T1 5
## 90 T2 5
## 91 T3 5
## 92 T4 5
## 93 T1 6
## 94 T2 6
## 95 T3 6
## 96 T4 6
## 97 T1 7
## 98 T2 7
## 99 T3 7
## 100 T4 7
## 101 T1 8
## 102 T2 8
## 103 T3 8
## 104 T4 8
## 105 T1 9
## 106 T2 9
## 107 T3 9
## 108 T4 9
## 109 T1 1
## 110 T2 1
## 111 T3 1
## 112 T4 1
## 113 T1 2
## 114 T2 2
## 115 T3 2
## 116 T4 2
## 117 T1 3
## 118 T2 3
## 119 T3 3
## 120 T4 3
## 121 T1 4
## 122 T2 4
## 123 T3 4
## 124 T4 4
## 125 T1 5
## 126 T2 5
## 127 T3 5
## 128 T4 5
## 129 T1 6
## 130 T2 6
## 131 T3 6
## 132 T4 6
## 133 T1 7
## 134 T2 7
## 135 T3 7
## 136 T4 7
## 137 T1 8
## 138 T2 8
## 139 T3 8
## 140 T4 8
## 141 T1 9
## 142 T2 9
## 143 T3 9
## 144 T4 9
xtabs(~ Type + Subj, data = d.ergo) # 4 replicates each
## Subj
## Type 1 2 3 4 5 6 7 8 9
## T1 4 4 4 4 4 4 4 4 4
## T2 4 4 4 4 4 4 4 4 4
## T3 4 4 4 4 4 4 4 4 4
## T4 4 4 4 4 4 4 4 4 4
set.seed(15) # a subset of cases:
xtabs(~ Type + Subj, data = d.ergo[sample(36, 10), ], sparse = TRUE)
## 4 x 9 sparse Matrix of class "dgCMatrix"
## Subj
## Type 1 2 3 4 5 6 7 8 9
## T1 . 1 . . . 1 . . .
## T2 . . . . . . . 1 1
## T3 . . 1 1 1 . 1 1 .
## T4 . 1 . . . . . . .
## Hypothetical two-level setup:
inner <- factor(sample(letters[1:25], 100, replace = TRUE))
inout <- factor(sample(LETTERS[1:5], 25, replace = TRUE))
fr <- data.frame(inner = inner, outer = inout[as.integer(inner)])
xtabs(~ inner + outer, fr, sparse = TRUE)
## 25 x 4 sparse Matrix of class "dgCMatrix"
## outer
## inner A B C E
## a . . . 3
## b . . . 6
## c . . 5 .
## d . 1 . .
## e . . 3 .
## f . . 5 .
## g . . 3 .
## h 6 . . .
## i 8 . . .
## j . 1 . .
## k . . . 6
## l . 2 . .
## m . . . 7
## n . . 3 .
## o . 3 . .
## p . 3 . .
## q . . . 7
## r . . . 2
## s . 6 . .
## t . . . 5
## u 4 . . .
## v . . 3 .
## w . 3 . .
## x . . 3 .
## y 2 . . .
#SENSITITY and specificity
Fiberbits <- read.csv("C:\\Amrita\\Datavedi\\Fiberbits\\Fiberbits.csv")
Fiberbits_model_1<-glm(active_cust~., family=binomial, data=Fiberbits)
threshold=0.5
predicted_values<-ifelse(predict(Fiberbits_model_1,type="response")>threshold,1,0)
actual_values<-Fiberbits_model_1$y
conf_matrix<-table(predicted_values,actual_values)
conf_matrix
library(caret)
sensitivity(conf_matrix)
specificity(conf_matrix)
sensit=(TP+FN)/YES
#TP=TRUE POSITIVES
specif=(TN+FP)/NO
ROC curve
f we consider all the possible threshold values and the corresponding specificity and sensitivity rate what will be the final model accuracy. ROC(Receiver operating characteristic) curve is drawn by taking False positive rate on X-axis and True positive rate on Y- axis ROC tells us, how many mistakes are we making to identify all the positives? https://statinfer.com/203-4-3-roc-and-auc/
library(pROC)
predicted_prob<-predict(prod_sales_Logit_model,type="response")
roccurve <- roc(prod_sales_Logit_model$y, predicted_prob)
plot(roccurve)
auc(roccurve) ### Area under the curve: 0.983 the closer to one the betterthe model is
auc(prod_sales_Logit_model$y, predicted_prob) #another way
predicted_prob<-predict(Fiberbits_model_1,type="response")
roccurve <- roc(Fiberbits_model_1$y, predicted_prob)
plot(roccurve)
Ahother example of specificity and ROC
## Not run:
###################
## 2 class example
lvs <- c("normal", "abnormal")
truth <- factor(rep(lvs, times = c(86, 258)),
levels = rev(lvs))
pred <- factor(
c(
rep(lvs, times = c(54, 32)),
rep(lvs, times = c(27, 231))),
levels = rev(lvs))
pred
## [1] normal normal normal normal normal normal normal normal
## [9] normal normal normal normal normal normal normal normal
## [17] normal normal normal normal normal normal normal normal
## [25] normal normal normal normal normal normal normal normal
## [33] normal normal normal normal normal normal normal normal
## [41] normal normal normal normal normal normal normal normal
## [49] normal normal normal normal normal normal abnormal abnormal
## [57] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [65] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [73] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [81] abnormal abnormal abnormal abnormal abnormal abnormal normal normal
## [89] normal normal normal normal normal normal normal normal
## [97] normal normal normal normal normal normal normal normal
## [105] normal normal normal normal normal normal normal normal
## [113] normal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [121] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [129] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [137] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [145] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [153] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [161] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [169] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [177] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [185] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [193] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [201] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [209] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [217] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [225] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [233] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [241] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [249] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [257] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [265] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [273] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [281] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [289] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [297] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [305] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [313] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [321] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [329] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## [337] abnormal abnormal abnormal abnormal abnormal abnormal abnormal abnormal
## Levels: abnormal normal
xtab <- table(pred, truth)
xtab
## truth
## pred abnormal normal
## abnormal 231 32
## normal 27 54
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
sensitivity(pred, truth)
## [1] 0.8953488
sensitivity(xtab)
## [1] 0.8953488
posPredValue(pred, truth)
## [1] 0.878327
posPredValue(pred, truth, prevalence = 0.25)
## [1] 0.4450867
specificity(pred, truth)
## [1] 0.627907
negPredValue(pred, truth)
## [1] 0.6666667
negPredValue(xtab)
## [1] 0.6666667
negPredValue(pred, truth, prevalence = 0.25)
## [1] 0.9473684
prev <- seq(0.001, .99, length = 20)
npvVals <- ppvVals <- prev * NA
for(i in seq(along = prev))
{
ppvVals[i] <- posPredValue(pred, truth, prevalence = prev[i])
npvVals[i] <- negPredValue(pred, truth, prevalence = prev[i])
}
plot(prev, ppvVals,
ylim = c(0, 1),
type = "l",
ylab = "",
xlab = "Prevalence (i.e. prior)")
points(prev, npvVals, type = "l", col = "red")
abline(h=sensitivity(pred, truth), lty = 2)
abline(h=specificity(pred, truth), lty = 2, col = "red")
legend(.5, .5,
c("ppv", "npv", "sens", "spec"),
col = c("black", "red", "black", "red"),
lty = c(1, 1, 2, 2))
###################
## 3 class example
library(MASS)
fit <- lda(Species ~ ., data = iris)
model <- predict(fit)$class
irisTabs <- table(model, iris$Species)
## When passing factors, an error occurs with more
## than two levels
#sensitivity(model, iris$Species)
## When passing a table, more than two levels can
## be used
sensitivity(irisTabs, "versicolor")
## [1] 0.96
specificity(irisTabs, c("setosa", "virginica"))
## [1] 0.99
## End(Not run)