library('DescTools')
# Describe a selection of d.pizza columns one by one, with plots enabled.
Desc(
  d.pizza[, c(
    "driver", "temperature", "count",
    "weekday", "wine_ordered", "date"
  )],
  plotit = TRUE
)
##
## -------------------------------------------------------------------------
## 'data.frame': 1209 obs. of 6 variables:
## 1 $ driver : Factor w/ 7 levels "Butcher","Carpenter",..: 7 1 1 7 3 7 7 7 7 3 ...
## 2 $ temperature : num 53 56.4 36.5 NA 50 27 33.9 54.8 48 54.4 ...
## 3 $ count : int 5 2 3 2 5 1 4 NA 3 6 ...
## 4 $ weekday : num 6 6 6 6 6 6 6 6 6 6 ...
## 5 $ wine_ordered: int 0 0 0 0 0 0 1 NA 0 1 ...
## 6 $ date : Date, format: "2014-03-01" "2014-03-01" ...
##
## -------------------------------------------------------------------------
## 1 - driver (factor)
##
## length n NAs levels unique dupes
## 1'209 1'204 5 7 7 y
##
## level freq perc cumfreq cumperc
## 1 Carpenter 272 .226 272 .226
## 2 Carter 234 .194 506 .420
## 3 Taylor 204 .169 710 .590
## 4 Hunter 156 .130 866 .719
## 5 Miller 125 .104 991 .823
## 6 Farmer 117 .097 1108 .920
## 7 Butcher 96 .080 1204 1.000
## -------------------------------------------------------------------------
## 2 - temperature (numeric)
##
## length n NAs unique 0s mean meanSE
## 1'209 1'170 39 375 0 47.937 0.291
##
## .05 .10 .25 median .75 .90 .95
## 26.700 33.290 42.225 50 55.300 58.800 60.500
##
## rng sd vcoef mad IQR skew kurt
## 45.500 9.938 0.207 9.192 13.075 -0.842 0.051
##
## lowest : 19.3, 19.4, 20, 20.2 (2), 20.35
## highest: 63.8, 64.1, 64.6, 64.7, 64.8
##
## Shapiro-Wilks normality test p.value : <2e-16
## -------------------------------------------------------------------------
## 3 - count (integer)
##
## length n NAs unique 0s mean meanSE
## 1'209 1'197 12 8 0 3.444 0.045
##
## .05 .10 .25 median .75 .90 .95
## 1 2 2 3 4 6 6
##
## rng sd vcoef mad IQR skew kurt
## 7 1.556 0.452 1.483 2 0.454 -0.363
##
## Shapiro-Wilks normality test p.value : <2e-16
##
##
## level freq perc cumfreq cumperc
## 1 1 108 .090 108 .090
## 2 2 259 .216 367 .307
## 3 3 300 .251 667 .557
## 4 4 240 .201 907 .758
## 5 5 152 .127 1059 .885
## 6 6 97 .081 1156 .966
## 7 7 34 .028 1190 .994
## 8 8 7 .006 1197 1.000
## -------------------------------------------------------------------------
## 4 - weekday (numeric)
##
## length n NAs unique 0s mean meanSE
## 1'209 1'177 32 7 0 4.441 0.059
##
## .05 .10 .25 median .75 .90 .95
## 1 1 3 5 6 7 7
##
## rng sd vcoef mad IQR skew kurt
## 6 2.019 0.455 2.965 3 -0.345 -1.170
##
## lowest : 1 (144), 2 (117), 3 (134), 4 (147), 5 (171)
## highest: 3 (134), 4 (147), 5 (171), 6 (244), 7 (220)
##
## Shapiro-Wilks normality test p.value : <2e-16
## -------------------------------------------------------------------------
## 5 - wine_ordered (integer - dichotomous)
##
## length n NAs unique
## 1'209 1'197 12 2
##
## freq perc lci.95 uci.95'
## 0 1010 .844 .822 .863
## 1 187 .156 .137 .178
##
## ' 95%-CI Wilson
## -------------------------------------------------------------------------
## 6 - date (Date)
##
## length n NAs unique
## 1'209 1'177 32 31
##
## lowest : 2014-03-01 (42), 2014-03-02 (46), 2014-03-03 (26), 2014-03-04 (19)
## highest: 2014-03-28 (46), 2014-03-29 (53), 2014-03-30 (43), 2014-03-31 (34)
##
##
## Weekdays:
## level freq perc cumfreq cumperc exp res
## 1 Monday 144 .122 144 .122 168.1 -1.9
## 2 Tuesday 117 .099 261 .222 168.1 -3.9
## 3 Wednesday 134 .114 395 .336 168.1 -2.6
## 4 Thursday 147 .125 542 .460 168.1 -1.6
## 5 Friday 171 .145 713 .606 168.1 .2
## 6 Saturday 244 .207 957 .813 168.1 5.9
## 7 Sunday 220 .187 1177 1.000 168.1 4.0
##
## Chi-squared test for given probabilities
##
## data: table(xd)
## X-squared = 78.88, df = 6, p-value = 6.09e-15
##
##
## Months:
## level freq perc cumfreq cumperc exp prs.res
## 1 January 0 0 0 0 99.7 -10.0
## 2 February 0 0 0 0 93.3 -9.7
## 3 March 1177 1 1177 1 99.7 107.9
## 4 April 0 0 1177 1 96.5 -9.8
## 5 May 0 0 1177 1 99.7 -10.0
## 6 June 0 0 1177 1 96.5 -9.8
## 7 July 0 0 1177 1 99.7 -10.0
## 8 August 0 0 1177 1 99.7 -10.0
## 9 September 0 0 1177 1 96.5 -9.8
## 10 October 0 0 1177 1 99.7 -10.0
## 11 November 0 0 1177 1 96.5 -9.8
## 12 December 0 0 1177 1 99.7 -10.0
##
## Chi-squared test for given probabilities
##
## data: tab
## X-squared = 12719, df = 11, p-value < 2.2e-16
##
##
## Table by days :
## level freq perc cumfreq cumperc
## 1 2014-03-01 42 .036 42 .036
## 2 2014-03-02 46 .039 88 .075
## 3 2014-03-03 26 .022 114 .097
## 4 2014-03-04 19 .016 133 .113
## 5 2014-03-05 33 .028 166 .141
## 6 2014-03-06 39 .033 205 .174
## 7 2014-03-07 44 .037 249 .212
## 8 2014-03-08 55 .047 304 .258
## 9 2014-03-09 42 .036 346 .294
## 10 2014-03-10 26 .022 372 .316
## 11 2014-03-11 34 .029 406 .345
## 12 2014-03-12 36 .031 442 .376
## 13 2014-03-13 35 .030 477 .405
## 14 2014-03-14 38 .032 515 .438
## 15 2014-03-15 48 .041 563 .478
## 16 2014-03-16 47 .040 610 .518
## 17 2014-03-17 30 .025 640 .544
## 18 2014-03-18 32 .027 672 .571
## 19 2014-03-19 31 .026 703 .597
## 20 2014-03-20 36 .031 739 .628
## 21 2014-03-21 43 .037 782 .664
## 22 2014-03-22 46 .039 828 .703
## 23 2014-03-23 42 .036 870 .739
## 24 2014-03-24 28 .024 898 .763
## 25 2014-03-25 32 .027 930 .790
## 26 2014-03-26 34 .029 964 .819
## 27 2014-03-27 37 .031 1001 .850
## 28 2014-03-28 46 .039 1047 .890
## 29 2014-03-29 53 .045 1100 .935
## 30 2014-03-30 43 .037 1143 .971
## 31 2014-03-31 34 .029 1177 1.000
# Frequency table of the temperature column; the recorded output below shows
# the numeric values binned into intervals of width 5.
Freq(d.pizza$temperature)
## level freq perc cumfreq cumperc
## 1 [15,20] 3 0.003 3 0.003
## 2 (20,25] 30 0.026 33 0.028
## 3 (25,30] 58 0.050 91 0.078
## 4 (30,35] 48 0.041 139 0.119
## 5 (35,40] 100 0.085 239 0.204
## 6 (40,45] 130 0.111 369 0.315
## 7 (45,50] 219 0.187 588 0.503
## 8 (50,55] 268 0.229 856 0.732
## 9 (55,60] 241 0.206 1097 0.938
## 10 (60,65] 73 0.062 1170 1.000
# Histogram of the temperature column using base graphics.
# The argument expression is left as is: hist() derives its default title and
# axis label from it.
hist(d.pizza$temperature)
A numeric variable versus a categorical variable is best described by group-wise measures.
# Describe temperature within each driver group, reported with one digit.
# Arguments stay positional so the printed "Call:" line is unchanged.
Desc(
  temperature ~ driver,
  d.pizza,
  digits = 1,
  plotit = TRUE
)
##
## Call:
## Desc.formula(temperature ~ driver, d.pizza, digits = 1, plotit = TRUE)
##
## -------------------------------------------------------------------------
## temperature ~ driver
##
## Summary:
## n pairs: 1'209, valid: 1'166 (96%), missings: 43 (4%), groups: 7
##
##
## Butcher Carpenter Carter Farmer Hunter Miller
## mean 49.6 43.5' 50.4 50.9 52.1" 47.5
## median 51.4 44.8' 51.8 54.1 55.1" 49.6
## sd 8.8 9.4 8.5 9.0 8.9 8.9
## IQR 12.0 12.5 11.3 11.2 11.6 8.8
## n 96 253 226 117 156 121
## np 0.082 0.217 0.194 0.100 0.134 0.104
## NAs 0 19 8 0 0 4
## 0s 0 0 0 0 0 0
##
## Taylor
## mean 45.1
## median 48.5
## sd 11.4
## IQR 18.4
## n 197
## np 0.169
## NAs 7
## 0s 0
##
## ' min, " max
##
## Kruskal-Wallis rank sum test:
## Kruskal-Wallis chi-squared = 141.9, df = 6, p-value < 2.2e-16
## Warning:
## Grouping variable contains 5 NAs (0.414%).
Two categorical variables are described by a contingency table.
# Contingency-table description of operator by area.
# NOTE(review): the original comment here asked whether this chunk was left
# unevaluated; output for it is recorded below, so it appears to have run.
Desc(
  operator ~ area,
  d.pizza,
  plotit = TRUE
)
##
## Call:
## Desc.formula(operator ~ area, d.pizza, plotit = TRUE)
##
## -------------------------------------------------------------------------
## operator ~ area
##
## Summary:
## n: 1191, rows: 3, columns: 3
##
## Pearson's Chi-squared test:
## X-squared = 17.91, df = 4, p-value = 0.001288
## Likelihood Ratio:
## X-squared = 18.0986, df = 4, p-value = 0.001181
## Mantel-Haenszel Chi-squared:
## X-squared = 8.6654, df = 1, p-value = 0.003243
##
## Phi-Coefficient 0.123
## Contingency Coeff. 0.122
## Cramer's V 0.087
##
## area Brent Camden Westminster Sum
## operator
## Allanah freq 153 123 89 365
## perc .128 .103 .075 .306
## p.row .419 .337 .244 1.000
## p.col .323 .362 .235 .306
## Maria freq 153 108 122 383
## perc .128 .091 .102 .322
## p.row .399 .282 .319 1.000
## p.col .323 .318 .323 .322
## Rhonda freq 167 109 167 443
## perc .140 .092 .140 .372
## p.row .377 .246 .377 1.000
## p.col .353 .321 .442 .372
## Sum freq 473 340 378 1191
## perc .397 .285 .317 1.000
## p.row .397 .285 .317 1.000
## p.col 1.000 1.000 1.000 1.000
Two numerical variables have no obvious standard description, so simple correlation coefficients (Pearson, Spearman and Kendall) are reported.
# Describe the relationship between two numeric columns; the recorded output
# reports Pearson, Spearman and Kendall correlations.
Desc(
  temperature ~ delivery_min,
  d.pizza,
  plotit = TRUE
)
##
## Call:
## Desc.formula(temperature ~ delivery_min, d.pizza, plotit = TRUE)
##
## -------------------------------------------------------------------------
## temperature ~ delivery_min
##
## Summary:
## n pairs: 1'209, valid: 1'170 (97%), missings: 39 (3%)
##
##
## Pearson corr. : -0.575
## Spearman corr.: -0.573
## Kendall corr. : -0.422
Frequencies and percentages:
# A) Two-way table of driver by area with row and column sums (margins 1, 2).
#    rfrq is a three-character 0/1 switch for total, row and column
#    percentages; "101" requests total and column percentages.
#
# Fix: the call previously passed d.pizza$city. The recorded output that
# follows is a one-way table of driver only, which is what results when the
# second argument is NULL (no such column). `area` is the column this file
# uses for the delivery area elsewhere (operator ~ area).
# TODO: re-run this chunk to refresh the recorded output below.
PercTable(
  d.pizza$driver, d.pizza$area,
  margins = c(1, 2),
  rfrq = "101"
)
## freq perc
##
## Butcher 96 .080
## Carpenter 272 .226
## Carter 234 .194
## Farmer 117 .097
## Hunter 156 .130
## Miller 125 .104
## Taylor 204 .169
## Sum 1204 1.000
Expected values and standardized residuals:
# B) Two-way table of driver by area showing expected counts and standardized
#    residuals instead of percentages (rfrq = "000" switches all three
#    percentage kinds off), printed with one digit.
#
# Fix: the call previously passed d.pizza$city. The recorded output that
# follows shows a single column with a constant expected value of 172.0
# (= 1204 / 7), i.e. a one-way table, which is what results when the second
# argument is NULL. `area` is the column this file uses for the delivery area
# elsewhere (operator ~ area).
# TODO: re-run this chunk to refresh the recorded output below.
PercTable(
  d.pizza$driver, d.pizza$area,
  margins = c(1, 2),
  rfrq = "000",
  expected = TRUE,
  stdres = TRUE,
  digits = 1
)
## freq zero zero
##
## Butcher 96 172.0 . -6.3 .
## Carpenter 272 172.0 . 8.2 .
## Carter 234 172.0 . 5.1 .
## Farmer 117 172.0 . -4.5 .
## Hunter 156 172.0 . -1.3 .
## Miller 125 172.0 . -3.9 .
## Taylor 204 172.0 . 2.6 .
## Sum 1204 . . . .
# 2 x 5 contingency table: treatment (No/Yes) by strength level (0 to 4).
# Built inside local() so that no helper variable leaks into the workspace.
pain <- local({
  counts <- rbind(
    c(26, 26, 23, 18, 9), # treat = "No"
    c(6, 7, 9, 14, 23)    # treat = "Yes"
  )
  dimnames(counts) <- list(
    treat = c("No", "Yes"),
    strength = as.character(0:4)
  )
  as.table(counts)
})
The verbosity can be set to ‘medium’, ‘low’ and ‘high’:
# Describe the pain table at the highest verbosity level; the recorded output
# adds association measures with confidence intervals.
Desc(
  pain,
  verb = "high",
  plotit = TRUE
)
## -------------------------------------------------------------------------
## pain (table)
##
## Summary:
## n: 161, rows: 2, columns: 5
##
## Pearson's Chi-squared test:
## X-squared = 26.6, df = 4, p-value = 2.392e-05
## Likelihood Ratio:
## X-squared = 26.6689, df = 4, p-value = 2.319e-05
## Mantel-Haenszel Chi-squared:
## X-squared = 22.8188, df = 1, p-value = 1.78e-06
## Warning: Cannot compute exact p-value with ties
## estimate lwr.ci upr.ci
## Phi Coeff. 0.4065 - -
## Contingency Coeff. 0.3766 - -
## Cramer V 0.4065 0.2212 0.5411
## Goodman Kruskal Gamma 0.5313 0.3480 0.7146
## Kendall Tau-b 0.3373 0.2114 0.4631
## Stuart Tau-c 0.4111 0.2547 0.5675
## Somers D C|R 0.4427 0.2786 0.6068
## Somers D R|C 0.2569 0.1593 0.3546
## Pearson Correlation 0.3776 0.2368 0.5029
## Spearman Correlation 0.3771 0.2362 0.5024
## Lambda C|R 0.1250 0.0000 0.2547
## Lambda R|C 0.2373 0.0732 0.4014
## Lambda sym 0.1604 0.0388 0.2821
## Uncertainty Coeff. C|R 0.0515 0.0140 0.0890
## Uncertainty Coeff. R|C 0.1261 0.0346 0.2175
## Uncertainty Coeff. sym 0.0731 0.0199 0.1262
## Mutual Information 0.1195 - -
##
## strength 0 1 2 3 4 Sum
## treat
## No freq 26 26 23 18 9 102
## perc .161 .161 .143 .112 .056 .634
## p.row .255 .255 .225 .176 .088 1.000
## p.col .812 .788 .719 .562 .281 .634
## Yes freq 6 7 9 14 23 59
## perc .037 .043 .056 .087 .143 .366
## p.row .102 .119 .153 .237 .390 1.000
## p.col .188 .212 .281 .438 .719 .366
## Sum freq 32 33 32 32 32 161
## perc .199 .205 .199 .199 .199 1.000
## p.row .199 .205 .199 .199 .199 1.000
## p.col 1.000 1.000 1.000 1.000 1.000 1.000
# Describe the pain table at the lowest verbosity level; the recorded output
# keeps only the chi-squared test and the raw counts.
Desc(
  pain,
  verb = "low",
  plotit = TRUE
)
## -------------------------------------------------------------------------
## pain (table)
##
## Summary:
## n: 161, rows: 2, columns: 5
##
## Pearson's Chi-squared test:
## X-squared = 26.6, df = 4, p-value = 2.392e-05
##
## strength 0 1 2 3 4 Sum
## treat
## No 26 26 23 18 9 102
## Yes 6 7 9 14 23 59
## Sum 32 33 32 32 32 161
# 2 x 2 contingency table: Cholesterol (High/Low) by Response (Yes/No).
# Built inside local() so that no helper variable leaks into the workspace.
heart <- local({
  counts <- rbind(
    c(11, 4), # Cholesterol = "High"
    c(2, 6)   # Cholesterol = "Low"
  )
  dimnames(counts) <- list(
    Cholesterol = c("High", "Low"),
    Response = c("Yes", "No")
  )
  as.table(counts)
})
# Describe the 2 x 2 heart table; the recorded output includes Fisher's exact
# test, McNemar's test, the odds ratio and relative risks.
Desc(
  heart,
  horiz = FALSE,
  plotit = TRUE
)
## -------------------------------------------------------------------------
## heart (table)
##
## Summary:
## n: 23, rows: 2, columns: 2
##
## Fisher's exact test p-value = 0.03931
## McNemar's chi-squared = 0.1667, df = 1, p-value = 0.6831
##
## estimate lwr.ci upr.ci
##
## odds ratio 8.250 1.154 59.003
## rel. risk (col1) 2.933 0.850 10.120
## rel. risk (col2) 0.356 0.140 0.901
##
## Phi-Coefficient 0.464
## Contingency Coeff. 0.421
## Cramer's V 0.464
##
## Response Yes No Sum
## Cholesterol
## High freq 11 4 15
## perc .478 .174 .652
## p.row .733 .267 1.000
## p.col .846 .400 .652
## Low freq 2 6 8
## perc .087 .261 .348
## p.row .250 .750 1.000
## p.col .154 .600 .348
## Sum freq 13 10 23
## perc .565 .435 1.000
## p.row .565 .435 1.000
## p.col 1.000 1.000 1.000