DescTools

Load DescTools

library('DescTools')

Describe variables in a data frame:

Desc(d.pizza[,c("driver","temperature","count","weekday","wine_ordered","date")], plotit=TRUE)

## 
## -------------------------------------------------------------------------
## 'data.frame':    1209 obs. of  6 variables:
##  1 $ driver      : Factor w/ 7 levels "Butcher","Carpenter",..: 7 1 1 7 3 7 7 7 7 3 ...
##  2 $ temperature : num  53 56.4 36.5 NA 50 27 33.9 54.8 48 54.4 ...
##  3 $ count       : int  5 2 3 2 5 1 4 NA 3 6 ...
##  4 $ weekday     : num  6 6 6 6 6 6 6 6 6 6 ...
##  5 $ wine_ordered: int  0 0 0 0 0 0 1 NA 0 1 ...
##  6 $ date        : Date, format: "2014-03-01" "2014-03-01" ...
## 
## ------------------------------------------------------------------------- 
## 1 - driver (factor)
## 
##   length      n    NAs levels unique  dupes
##    1'209  1'204      5      7      7      y
## 
##       level freq  perc cumfreq cumperc
## 1 Carpenter  272  .226     272    .226
## 2    Carter  234  .194     506    .420
## 3    Taylor  204  .169     710    .590
## 4    Hunter  156  .130     866    .719
## 5    Miller  125  .104     991    .823
## 6    Farmer  117  .097    1108    .920
## 7   Butcher   96  .080    1204   1.000

plot of chunk unnamed-chunk-2

## ------------------------------------------------------------------------- 
## 2 - temperature (numeric)
## 
##   length      n    NAs unique     0s   mean meanSE
##    1'209  1'170     39    375      0 47.937  0.291
## 
##      .05    .10    .25 median    .75    .90    .95
##   26.700 33.290 42.225     50 55.300 58.800 60.500
## 
##      rng     sd  vcoef    mad    IQR   skew   kurt
##   45.500  9.938  0.207  9.192 13.075 -0.842  0.051
##  
## lowest : 19.3, 19.4, 20, 20.2 (2), 20.35
## highest: 63.8, 64.1, 64.6, 64.7, 64.8
## 
## Shapiro-Wilks normality test  p.value : <2e-16

plot of chunk unnamed-chunk-2

## ------------------------------------------------------------------------- 
## 3 - count (integer)
## 
##   length      n    NAs unique     0s   mean meanSE
##    1'209  1'197     12      8      0  3.444  0.045
## 
##      .05    .10    .25 median    .75    .90    .95
##        1      2      2      3      4      6      6
## 
##      rng     sd  vcoef    mad    IQR   skew   kurt
##        7  1.556  0.452  1.483      2  0.454 -0.363
##  
## Shapiro-Wilks normality test  p.value : <2e-16 
## 
## 
##   level freq  perc cumfreq cumperc
## 1     1  108  .090     108    .090
## 2     2  259  .216     367    .307
## 3     3  300  .251     667    .557
## 4     4  240  .201     907    .758
## 5     5  152  .127    1059    .885
## 6     6   97  .081    1156    .966
## 7     7   34  .028    1190    .994
## 8     8    7  .006    1197   1.000

plot of chunk unnamed-chunk-2

## ------------------------------------------------------------------------- 
## 4 - weekday (numeric)
## 
##   length      n    NAs unique     0s   mean meanSE
##    1'209  1'177     32      7      0  4.441  0.059
## 
##      .05    .10    .25 median    .75    .90    .95
##        1      1      3      5      6      7      7
## 
##      rng     sd  vcoef    mad    IQR   skew   kurt
##        6  2.019  0.455  2.965      3 -0.345 -1.170
##  
## lowest : 1 (144), 2 (117), 3 (134), 4 (147), 5 (171)
## highest: 3 (134), 4 (147), 5 (171), 6 (244), 7 (220)
## 
## Shapiro-Wilks normality test  p.value : <2e-16

plot of chunk unnamed-chunk-2

## ------------------------------------------------------------------------- 
## 5 - wine_ordered (integer - dichotomous)
## 
##   length      n    NAs unique
##    1'209  1'197     12      2
##  
##   freq  perc lci.95 uci.95'
## 0 1010  .844   .822   .863
## 1  187  .156   .137   .178
## 
## ' 95%-CI Wilson

plot of chunk unnamed-chunk-2

## ------------------------------------------------------------------------- 
## 6 - date (Date)
## 
##   length      n    NAs unique
##    1'209  1'177     32     31
##  
## lowest : 2014-03-01 (42), 2014-03-02 (46), 2014-03-03 (26), 2014-03-04 (19)
## highest: 2014-03-28 (46), 2014-03-29 (53), 2014-03-30 (43), 2014-03-31 (34)
## 
## 
## Weekdays:
##       level freq  perc cumfreq cumperc   exp  res
## 1    Monday  144  .122     144    .122 168.1 -1.9
## 2   Tuesday  117  .099     261    .222 168.1 -3.9
## 3 Wednesday  134  .114     395    .336 168.1 -2.6
## 4  Thursday  147  .125     542    .460 168.1 -1.6
## 5    Friday  171  .145     713    .606 168.1   .2
## 6  Saturday  244  .207     957    .813 168.1  5.9
## 7    Sunday  220  .187    1177   1.000 168.1  4.0
## 
##  Chi-squared test for given probabilities
## 
## data:  table(xd)
## X-squared = 78.88, df = 6, p-value = 6.09e-15
## 
## 
## Months:
##        level freq perc cumfreq cumperc  exp prs.res
## 1    January    0    0       0       0 99.7   -10.0
## 2   February    0    0       0       0 93.3    -9.7
## 3      March 1177    1    1177       1 99.7   107.9
## 4      April    0    0    1177       1 96.5    -9.8
## 5        May    0    0    1177       1 99.7   -10.0
## 6       June    0    0    1177       1 96.5    -9.8
## 7       July    0    0    1177       1 99.7   -10.0
## 8     August    0    0    1177       1 99.7   -10.0
## 9  September    0    0    1177       1 96.5    -9.8
## 10   October    0    0    1177       1 99.7   -10.0
## 11  November    0    0    1177       1 96.5    -9.8
## 12  December    0    0    1177       1 99.7   -10.0
## 
##  Chi-squared test for given probabilities
## 
## data:  tab
## X-squared = 12719, df = 11, p-value < 2.2e-16
## 
## 
## Table by days :
##         level freq  perc cumfreq cumperc
## 1  2014-03-01   42  .036      42    .036
## 2  2014-03-02   46  .039      88    .075
## 3  2014-03-03   26  .022     114    .097
## 4  2014-03-04   19  .016     133    .113
## 5  2014-03-05   33  .028     166    .141
## 6  2014-03-06   39  .033     205    .174
## 7  2014-03-07   44  .037     249    .212
## 8  2014-03-08   55  .047     304    .258
## 9  2014-03-09   42  .036     346    .294
## 10 2014-03-10   26  .022     372    .316
## 11 2014-03-11   34  .029     406    .345
## 12 2014-03-12   36  .031     442    .376
## 13 2014-03-13   35  .030     477    .405
## 14 2014-03-14   38  .032     515    .438
## 15 2014-03-15   48  .041     563    .478
## 16 2014-03-16   47  .040     610    .518
## 17 2014-03-17   30  .025     640    .544
## 18 2014-03-18   32  .027     672    .571
## 19 2014-03-19   31  .026     703    .597
## 20 2014-03-20   36  .031     739    .628
## 21 2014-03-21   43  .037     782    .664
## 22 2014-03-22   46  .039     828    .703
## 23 2014-03-23   42  .036     870    .739
## 24 2014-03-24   28  .024     898    .763
## 25 2014-03-25   32  .027     930    .790
## 26 2014-03-26   34  .029     964    .819
## 27 2014-03-27   37  .031    1001    .850
## 28 2014-03-28   46  .039    1047    .890
## 29 2014-03-29   53  .045    1100    .935
## 30 2014-03-30   43  .037    1143    .971
## 31 2014-03-31   34  .029    1177   1.000

plot of chunk unnamed-chunk-2

Simple frequencies:

Freq(d.pizza$temperature)

##      level freq  perc cumfreq cumperc
## 1  [15,20]    3 0.003       3   0.003
## 2  (20,25]   30 0.026      33   0.028
## 3  (25,30]   58 0.050      91   0.078
## 4  (30,35]   48 0.041     139   0.119
## 5  (35,40]  100 0.085     239   0.204
## 6  (40,45]  130 0.111     369   0.315
## 7  (45,50]  219 0.187     588   0.503
## 8  (50,55]  268 0.229     856   0.732
## 9  (55,60]  241 0.206    1097   0.938
## 10 (60,65]   73 0.062    1170   1.000

hist(d.pizza$temperature)

plot of chunk unnamed-chunk-5

Pairwise descriptions:

A numeric variable versus a categorical variable is best described by group-wise measures.

Desc(temperature ~ driver, d.pizza, digits=1, plotit=TRUE)

## 
## Call:
## Desc.formula(temperature ~ driver, d.pizza, digits = 1, plotit = TRUE)
## 
## ------------------------------------------------------------------------- 
## temperature ~ driver
## 
## Summary: 
## n pairs: 1'209, valid: 1'166 (96%), missings: 43 (4%), groups: 7
## 
## 
##           Butcher  Carpenter     Carter     Farmer     Hunter     Miller  
## mean         49.6       43.5'      50.4       50.9       52.1"      47.5  
## median       51.4       44.8'      51.8       54.1       55.1"      49.6  
## sd            8.8        9.4        8.5        9.0        8.9        8.9  
## IQR          12.0       12.5       11.3       11.2       11.6        8.8  
## n              96        253        226        117        156        121  
## np          0.082      0.217      0.194      0.100      0.134      0.104  
## NAs             0         19          8          0          0          4  
## 0s              0          0          0          0          0          0  
## 
##            Taylor 
## mean         45.1 
## median       48.5 
## sd           11.4 
## IQR          18.4 
## n             197 
## np          0.169 
## NAs             7 
## 0s              0 
## 
## ' min, " max
## 
## Kruskal-Wallis rank sum test:
##   Kruskal-Wallis chi-squared = 141.9, df = 6, p-value < 2.2e-16
## Warning:
##   Grouping variable contains 5 NAs (0.414%).

plot of chunk unnamed-chunk-6

Two categorical variables are described by a contingency table.

# a chunk unevaluated??
Desc(operator ~ area, d.pizza, plotit=TRUE)

## 
## Call:
## Desc.formula(operator ~ area, d.pizza, plotit = TRUE)
## 
## ------------------------------------------------------------------------- 
## operator ~ area
## 
## Summary: 
## n: 1191, rows: 3, columns: 3
## 
## Pearson's Chi-squared test:
##   X-squared = 17.91, df = 4, p-value = 0.001288
## Likelihood Ratio:
##   X-squared = 18.0986, df = 4, p-value = 0.001181
## Mantel-Haenszel Chi-squared:
##   X-squared = 8.6654, df = 1, p-value = 0.003243
## 
## Phi-Coefficient        0.123
## Contingency Coeff.     0.122
## Cramer's V             0.087
## 
##                area  Brent Camden Westminster    Sum
## operator                                            
## Allanah  freq          153    123          89    365
##          perc         .128   .103        .075   .306
##          p.row        .419   .337        .244  1.000
##          p.col        .323   .362        .235   .306
## Maria    freq          153    108         122    383
##          perc         .128   .091        .102   .322
##          p.row        .399   .282        .319  1.000
##          p.col        .323   .318        .323   .322
## Rhonda   freq          167    109         167    443
##          perc         .140   .092        .140   .372
##          p.row        .377   .246        .377  1.000
##          p.col        .353   .321        .442   .372
## Sum      freq          473    340         378   1191
##          perc         .397   .285        .317  1.000
##          p.row        .397   .285        .317  1.000
##          p.col       1.000  1.000       1.000  1.000

plot of chunk unnamed-chunk-7

Two numerical variables have no obvious standard description; Desc reports simple correlation coefficients (Pearson, Spearman and Kendall).

Desc(temperature ~ delivery_min, d.pizza, plotit=TRUE)

## 
## Call:
## Desc.formula(temperature ~ delivery_min, d.pizza, plotit = TRUE)
## 
## ------------------------------------------------------------------------- 
## temperature ~ delivery_min
## 
## Summary: 
## n pairs: 1'209, valid: 1'170 (97%), missings: 39 (3%)
## 
## 
## Pearson corr. : -0.575
## Spearman corr.: -0.573
## Kendall corr. : -0.422

plot of chunk unnamed-chunk-8

Tables

Frequencies and percentages:

# A) 
PercTable(d.pizza$driver, d.pizza$city, margins=c(1,2)
          , rfrq="101")

##              freq   perc
##                         
## Butcher        96   .080
## Carpenter     272   .226
## Carter        234   .194
## Farmer        117   .097
## Hunter        156   .130
## Miller        125   .104
## Taylor        204   .169
## Sum          1204  1.000

Expected values and standardized residuals:

# B) 
PercTable(d.pizza$driver, d.pizza$city, margins=c(1,2)
          , rfrq="000", expected=TRUE, stdres=TRUE
          , digits=1)

##             freq        zero        zero
##                                         
## Butcher      96  172.0    .  -6.3     . 
## Carpenter   272  172.0    .   8.2     . 
## Carter      234  172.0    .   5.1     . 
## Farmer      117  172.0    .  -4.5     . 
## Hunter      156  172.0    .  -1.3     . 
## Miller      125  172.0    .  -3.9     . 
## Taylor      204  172.0    .   2.6     . 
## Sum        1204     .     .     .     .

pain <- as.table(matrix(c(26,26,23,18,9,6,7,9,14,23)
                        , nrow=2, byrow=TRUE, 
                        dimnames=list(treat=c("No","Yes"), strength=0:4))
                 )

The verbosity can be set to ‘low’, ‘medium’ or ‘high’:

Desc(pain, verb="high", plotit = TRUE)

## ------------------------------------------------------------------------- 
## pain (table) 
## 
## Summary: 
## n: 161, rows: 2, columns: 5
## 
## Pearson's Chi-squared test:
##   X-squared = 26.6, df = 4, p-value = 2.392e-05
## Likelihood Ratio:
##   X-squared = 26.6689, df = 4, p-value = 2.319e-05
## Mantel-Haenszel Chi-squared:
##   X-squared = 22.8188, df = 1, p-value = 1.78e-06

## Warning: Cannot compute exact p-value with ties

##                        estimate  lwr.ci  upr.ci
## Phi Coeff.               0.4065       -       -
## Contingency Coeff.       0.3766       -       -
## Cramer V                 0.4065  0.2212  0.5411
## Goodman Kruskal Gamma    0.5313  0.3480  0.7146
## Kendall Tau-b            0.3373  0.2114  0.4631
## Stuart Tau-c             0.4111  0.2547  0.5675
## Somers D C|R             0.4427  0.2786  0.6068
## Somers D R|C             0.2569  0.1593  0.3546
## Pearson Correlation      0.3776  0.2368  0.5029
## Spearman Correlation     0.3771  0.2362  0.5024
## Lambda C|R               0.1250  0.0000  0.2547
## Lambda R|C               0.2373  0.0732  0.4014
## Lambda sym               0.1604  0.0388  0.2821
## Uncertainty Coeff. C|R   0.0515  0.0140  0.0890
## Uncertainty Coeff. R|C   0.1261  0.0346  0.2175
## Uncertainty Coeff. sym   0.0731  0.0199  0.1262
## Mutual Information       0.1195       -       -
## 
##             strength      0      1      2      3      4    Sum
## treat                                                         
## No    freq               26     26     23     18      9    102
##       perc             .161   .161   .143   .112   .056   .634
##       p.row            .255   .255   .225   .176   .088  1.000
##       p.col            .812   .788   .719   .562   .281   .634
## Yes   freq                6      7      9     14     23     59
##       perc             .037   .043   .056   .087   .143   .366
##       p.row            .102   .119   .153   .237   .390  1.000
##       p.col            .188   .212   .281   .438   .719   .366
## Sum   freq               32     33     32     32     32    161
##       perc             .199   .205   .199   .199   .199  1.000
##       p.row            .199   .205   .199   .199   .199  1.000
##       p.col           1.000  1.000  1.000  1.000  1.000  1.000

plot of chunk unnamed-chunk-13

Desc(pain, verb="low", plotit = TRUE)

## ------------------------------------------------------------------------- 
## pain (table) 
## 
## Summary: 
## n: 161, rows: 2, columns: 5
## 
## Pearson's Chi-squared test:
##   X-squared = 26.6, df = 4, p-value = 2.392e-05
## 
##       strength      0      1      2      3      4    Sum
## treat                                                   
## No                 26     26     23     18      9    102
## Yes                 6      7      9     14     23     59
## Sum                32     33     32     32     32    161

plot of chunk unnamed-chunk-14

2x2 table

heart <- as.table(matrix(c(11,4,2,6), nrow=2, byrow=TRUE, 
                       dimnames=list(Cholesterol=c("High","Low"), Response=c("Yes","No"))))
Desc(heart, horiz=FALSE, plotit = TRUE)

## ------------------------------------------------------------------------- 
## heart (table) 
## 
## Summary: 
## n: 23, rows: 2, columns: 2
## 
## Fisher's exact test p-value = 0.03931
## McNemar's chi-squared = 0.1667, df = 1, p-value = 0.6831
## 
##                     estimate lwr.ci upr.ci
##                                           
## odds ratio             8.250  1.154 59.003
## rel. risk (col1)       2.933  0.850 10.120
## rel. risk (col2)       0.356  0.140  0.901
## 
## Phi-Coefficient        0.464
## Contingency Coeff.     0.421
## Cramer's V             0.464
## 
##                   Response    Yes     No    Sum
## Cholesterol                                    
## High        freq               11      4     15
##             perc             .478   .174   .652
##             p.row            .733   .267  1.000
##             p.col            .846   .400   .652
## Low         freq                2      6      8
##             perc             .087   .261   .348
##             p.row            .250   .750  1.000
##             p.col            .154   .600   .348
## Sum         freq               13     10     23
##             perc             .565   .435  1.000
##             p.row            .565   .435  1.000
##             p.col           1.000  1.000  1.000

plot of chunk unnamed-chunk-15

DescTools

M.Devlin

Friday, September 26, 2014

Load DescTools

Describe variables in a data frame:

Simple frequencies:

Pairwise descriptions:

Tables

2x2 table