MTCARS Dataset

library(datasets)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
# Convert the built-in data frame to a tibble for compact printing.
# as_tibble() is the supported replacement for the deprecated tbl_df().
mtcars <- as_tibble(mtcars)
mtcars
## # A tibble: 32 x 11
##      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##  * <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  21.0     6 160.0   110  3.90 2.620 16.46     0     1     4     4
##  2  21.0     6 160.0   110  3.90 2.875 17.02     0     1     4     4
##  3  22.8     4 108.0    93  3.85 2.320 18.61     1     1     4     1
##  4  21.4     6 258.0   110  3.08 3.215 19.44     1     0     3     1
##  5  18.7     8 360.0   175  3.15 3.440 17.02     0     0     3     2
##  6  18.1     6 225.0   105  2.76 3.460 20.22     1     0     3     1
##  7  14.3     8 360.0   245  3.21 3.570 15.84     0     0     3     4
##  8  24.4     4 146.7    62  3.69 3.190 20.00     1     0     4     2
##  9  22.8     4 140.8    95  3.92 3.150 22.90     1     0     4     2
## 10  19.2     6 167.6   123  3.92 3.440 18.30     1     0     4     4
## # ... with 22 more rows

Classifying each variable as discrete or continuous

mpg=continuous

cyl=discrete

disp=continuous

hp=continuous

drat=continuous

wt=continuous

qsec=continuous

vs=discrete

am=discrete

gear=discrete

carb=discrete

library(tibble)

Summary of mtcars

summary(mtcars)
##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000

Distribution of 3 variables in mtcars (hp, wt, carb)

mtcars$hp
##  [1] 110 110  93 110 175 105 245  62  95 123 123 180 180 180 205 215 230
## [18]  66  52  65  97 150 150 245 175  66  91 113 264 175 335 109
mtcars$wt
##  [1] 2.620 2.875 2.320 3.215 3.440 3.460 3.570 3.190 3.150 3.440 3.440
## [12] 4.070 3.730 3.780 5.250 5.424 5.345 2.200 1.615 1.835 2.465 3.520
## [23] 3.435 3.840 3.845 1.935 2.140 1.513 3.170 2.770 3.570 2.780
mtcars$carb
##  [1] 4 4 1 1 2 1 4 2 2 4 4 3 3 3 4 4 4 1 2 1 1 2 2 4 2 1 2 2 4 6 8 2

ESOPH Dataset

Data type of 3 variables in esoph

# Convert the built-in data frame to a tibble for compact printing.
# as_tibble() is the supported replacement for the deprecated tbl_df().
esoph <- as_tibble(esoph)
esoph
## # A tibble: 88 x 5
##    agegp     alcgp    tobgp ncases ncontrols
##    <ord>     <ord>    <ord>  <dbl>     <dbl>
##  1 25-34 0-39g/day 0-9g/day      0        40
##  2 25-34 0-39g/day    10-19      0        10
##  3 25-34 0-39g/day    20-29      0         6
##  4 25-34 0-39g/day      30+      0         5
##  5 25-34     40-79 0-9g/day      0        27
##  6 25-34     40-79    10-19      0         7
##  7 25-34     40-79    20-29      0         4
##  8 25-34     40-79      30+      0         7
##  9 25-34    80-119 0-9g/day      0         2
## 10 25-34    80-119    10-19      0         1
## # ... with 78 more rows

agegp=discrete

alcgp=discrete

tobgp=discrete

Frequency Distribution of agegp

esoph$agegp
##  [1] 25-34 25-34 25-34 25-34 25-34 25-34 25-34 25-34 25-34 25-34 25-34
## [12] 25-34 25-34 25-34 25-34 35-44 35-44 35-44 35-44 35-44 35-44 35-44
## [23] 35-44 35-44 35-44 35-44 35-44 35-44 35-44 35-44 45-54 45-54 45-54
## [34] 45-54 45-54 45-54 45-54 45-54 45-54 45-54 45-54 45-54 45-54 45-54
## [45] 45-54 45-54 55-64 55-64 55-64 55-64 55-64 55-64 55-64 55-64 55-64
## [56] 55-64 55-64 55-64 55-64 55-64 55-64 55-64 65-74 65-74 65-74 65-74
## [67] 65-74 65-74 65-74 65-74 65-74 65-74 65-74 65-74 65-74 65-74 65-74
## [78] 75+   75+   75+   75+   75+   75+   75+   75+   75+   75+   75+  
## Levels: 25-34 < 35-44 < 45-54 < 55-64 < 65-74 < 75+
agegp.freq <- table(esoph$agegp)
agegp.freq
## 
## 25-34 35-44 45-54 55-64 65-74   75+ 
##    15    15    16    16    15    11
cbind(agegp.freq)
##       agegp.freq
## 25-34         15
## 35-44         15
## 45-54         16
## 55-64         16
## 65-74         15
## 75+           11

Relative Frequency of agegp

# Relative frequency = count in each level / total number of observations,
# so the proportions sum to 1. (Dividing by the number of levels does not
# give a proportion.)
nrow(esoph)
## [1] 88
agegp.relfreq <- table(esoph$agegp) / nrow(esoph)
agegp.relfreq
## 
##     25-34     35-44     45-54     55-64     65-74       75+ 
## 0.1704545 0.1704545 0.1818182 0.1818182 0.1704545 0.1250000
round.agegp.relfreq <- round(agegp.relfreq, digits = 2)
round.agegp.relfreq
## 
## 25-34 35-44 45-54 55-64 65-74   75+ 
##  0.17  0.17  0.18  0.18  0.17  0.12

Frequency Distribution of alcgp

esoph$alcgp
##  [1] 0-39g/day 0-39g/day 0-39g/day 0-39g/day 40-79     40-79     40-79    
##  [8] 40-79     80-119    80-119    80-119    120+      120+      120+     
## [15] 120+      0-39g/day 0-39g/day 0-39g/day 0-39g/day 40-79     40-79    
## [22] 40-79     40-79     80-119    80-119    80-119    80-119    120+     
## [29] 120+      120+      0-39g/day 0-39g/day 0-39g/day 0-39g/day 40-79    
## [36] 40-79     40-79     40-79     80-119    80-119    80-119    80-119   
## [43] 120+      120+      120+      120+      0-39g/day 0-39g/day 0-39g/day
## [50] 0-39g/day 40-79     40-79     40-79     40-79     80-119    80-119   
## [57] 80-119    80-119    120+      120+      120+      120+      0-39g/day
## [64] 0-39g/day 0-39g/day 0-39g/day 40-79     40-79     40-79     80-119   
## [71] 80-119    80-119    80-119    120+      120+      120+      120+     
## [78] 0-39g/day 0-39g/day 0-39g/day 40-79     40-79     40-79     40-79    
## [85] 80-119    80-119    120+      120+     
## Levels: 0-39g/day < 40-79 < 80-119 < 120+
alcgp.freq <- table(esoph$alcgp)
alcgp.freq
## 
## 0-39g/day     40-79    80-119      120+ 
##        23        23        21        21
cbind(alcgp.freq)
##           alcgp.freq
## 0-39g/day         23
## 40-79             23
## 80-119            21
## 120+              21

Relative Frequency of alcgp

# Relative frequency = count in each level / total number of observations,
# so the proportions sum to 1. (Dividing by the number of levels does not
# give a proportion.)
nrow(esoph)
## [1] 88
alcgp.relfreq <- table(esoph$alcgp) / nrow(esoph)
alcgp.relfreq
## 
## 0-39g/day     40-79    80-119      120+ 
## 0.2613636 0.2613636 0.2386364 0.2386364

Frequency Distribution of tobgp

esoph$tobgp
##  [1] 0-9g/day 10-19    20-29    30+      0-9g/day 10-19    20-29   
##  [8] 30+      0-9g/day 10-19    30+      0-9g/day 10-19    20-29   
## [15] 30+      0-9g/day 10-19    20-29    30+      0-9g/day 10-19   
## [22] 20-29    30+      0-9g/day 10-19    20-29    30+      0-9g/day
## [29] 10-19    20-29    0-9g/day 10-19    20-29    30+      0-9g/day
## [36] 10-19    20-29    30+      0-9g/day 10-19    20-29    30+     
## [43] 0-9g/day 10-19    20-29    30+      0-9g/day 10-19    20-29   
## [50] 30+      0-9g/day 10-19    20-29    30+      0-9g/day 10-19   
## [57] 20-29    30+      0-9g/day 10-19    20-29    30+      0-9g/day
## [64] 10-19    20-29    30+      0-9g/day 10-19    20-29    0-9g/day
## [71] 10-19    20-29    30+      0-9g/day 10-19    20-29    30+     
## [78] 0-9g/day 10-19    30+      0-9g/day 10-19    20-29    30+     
## [85] 0-9g/day 10-19    0-9g/day 10-19   
## Levels: 0-9g/day < 10-19 < 20-29 < 30+
tobgp.freq <- table(esoph$tobgp)
tobgp.freq
## 
## 0-9g/day    10-19    20-29      30+ 
##       24       24       20       20
cbind(tobgp.freq)
##          tobgp.freq
## 0-9g/day         24
## 10-19            24
## 20-29            20
## 30+              20

Relative frequency of tobgp

# Relative frequency = count in each level / total number of observations,
# so the proportions sum to 1. (Dividing by the number of levels does not
# give a proportion.)
nrow(esoph)
## [1] 88
tobgp.relfreq <- table(esoph$tobgp) / nrow(esoph)
tobgp.relfreq
## 
##  0-9g/day     10-19     20-29       30+ 
## 0.2727273 0.2727273 0.2272727 0.2272727

Joint Frequency of agegp and alcgp

# Cross-tabulate the two ordered factors directly. Passing them through c()
# strips the level labels on older R (leaving bare integer codes 1..6 / 1..4)
# and behaves differently on R >= 4.1, so the factors are given to table()
# as-is. The argument names become the dimension names of the table.
esophtab <- table(agegp = esoph$agegp, alcgp = esoph$alcgp)
esophtab
##        alcgp
## agegp   0-39g/day 40-79 80-119 120+
##   25-34         4     4      3    4
##   35-44         4     4      4    3
##   45-54         4     4      4    4
##   55-64         4     4      4    4
##   65-74         4     3      4    4
##   75+           3     4      2    2
ftable(esophtab)
##       alcgp 0-39g/day 40-79 80-119 120+
## agegp                                  
## 25-34               4     4      3    4
## 35-44               4     4      4    3
## 45-54               4     4      4    4
## 55-64               4     4      4    4
## 65-74               4     3      4    4
## 75+                 3     4      2    2

Joint Frequency of alcgp and tobgp

# Cross-tabulate the two ordered factors directly. Passing them through c()
# strips the level labels on older R (leaving bare integer codes 1..4) and
# behaves differently on R >= 4.1, so the factors are given to table() as-is.
# The argument names become the dimension names of the table.
esophtab2 <- table(alcgp = esoph$alcgp, tobgp = esoph$tobgp)
esophtab2
##            tobgp
## alcgp       0-9g/day 10-19 20-29 30+
##   0-39g/day        6     6     5   6
##   40-79            6     6     6   5
##   80-119           6     6     4   5
##   120+             6     6     5   4
ftable(esophtab2)
##           tobgp 0-9g/day 10-19 20-29 30+
## alcgp                                   
## 0-39g/day              6     6     5   6
## 40-79                  6     6     6   5
## 80-119                 6     6     4   5
## 120+                   6     6     5   4

DIAMONDS Dataset: loading the diamonds dataset (from ggplot2)

library(ggplot2)
diamonds
## # A tibble: 53,940 x 10
##    carat       cut color clarity depth table price     x     y     z
##    <dbl>     <ord> <ord>   <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  0.23     Ideal     E     SI2  61.5    55   326  3.95  3.98  2.43
##  2  0.21   Premium     E     SI1  59.8    61   326  3.89  3.84  2.31
##  3  0.23      Good     E     VS1  56.9    65   327  4.05  4.07  2.31
##  4  0.29   Premium     I     VS2  62.4    58   334  4.20  4.23  2.63
##  5  0.31      Good     J     SI2  63.3    58   335  4.34  4.35  2.75
##  6  0.24 Very Good     J    VVS2  62.8    57   336  3.94  3.96  2.48
##  7  0.24 Very Good     I    VVS1  62.3    57   336  3.95  3.98  2.47
##  8  0.26 Very Good     H     SI1  61.9    55   337  4.07  4.11  2.53
##  9  0.22      Fair     E     VS2  65.1    61   337  3.87  3.78  2.49
## 10  0.23 Very Good     H     VS1  59.4    61   338  4.00  4.05  2.39
## # ... with 53,930 more rows

Range of Price

range(diamonds$price)
## [1]   326 18823

Range of Carat

range(diamonds$carat)
## [1] 0.20 5.01

Range of Depth

range(diamonds$depth)
## [1] 43 79

Range of Table

range(diamonds$table)
## [1] 43 95

Grouped Frequency of Depth

library(tibble)
library(readr)
# Ensure diamonds is a tibble for compact printing.
# as_tibble() is the supported replacement for the deprecated tbl_df().
diamonds <- as_tibble(diamonds)
diamonds
## # A tibble: 53,940 x 10
##    carat       cut color clarity depth table price     x     y     z
##    <dbl>     <ord> <ord>   <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  0.23     Ideal     E     SI2  61.5    55   326  3.95  3.98  2.43
##  2  0.21   Premium     E     SI1  59.8    61   326  3.89  3.84  2.31
##  3  0.23      Good     E     VS1  56.9    65   327  4.05  4.07  2.31
##  4  0.29   Premium     I     VS2  62.4    58   334  4.20  4.23  2.63
##  5  0.31      Good     J     SI2  63.3    58   335  4.34  4.35  2.75
##  6  0.24 Very Good     J    VVS2  62.8    57   336  3.94  3.96  2.48
##  7  0.24 Very Good     I    VVS1  62.3    57   336  3.95  3.98  2.47
##  8  0.26 Very Good     H     SI1  61.9    55   337  4.07  4.11  2.53
##  9  0.22      Fair     E     VS2  65.1    61   337  3.87  3.78  2.49
## 10  0.23 Very Good     H     VS1  59.4    61   338  4.00  4.05  2.39
## # ... with 53,930 more rows
summary(diamonds)
##      carat               cut        color        clarity     
##  Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065  
##  1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258  
##  Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194  
##  Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171  
##  3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066  
##  Max.   :5.0100                     I: 5422   VVS1   : 3655  
##                                     J: 2808   (Other): 2531  
##      depth           table           price             x         
##  Min.   :43.00   Min.   :43.00   Min.   :  326   Min.   : 0.000  
##  1st Qu.:61.00   1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710  
##  Median :61.80   Median :57.00   Median : 2401   Median : 5.700  
##  Mean   :61.75   Mean   :57.46   Mean   : 3933   Mean   : 5.731  
##  3rd Qu.:62.50   3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540  
##  Max.   :79.00   Max.   :95.00   Max.   :18823   Max.   :10.740  
##                                                                  
##        y                z         
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 4.720   1st Qu.: 2.910  
##  Median : 5.710   Median : 3.530  
##  Mean   : 5.735   Mean   : 3.539  
##  3rd Qu.: 6.540   3rd Qu.: 4.040  
##  Max.   :58.900   Max.   :31.800  
## 
# Bin edges every 5 units from 40 to 80.
breaks.depth <- seq(from = 40, to = 80, by = 5)
breaks.depth
## [1] 40 45 50 55 60 65 70 75 80
# right = FALSE makes each bin left-closed: [a, b).
depth.cut <- cut(diamonds$depth, breaks = breaks.depth, right = FALSE)
# Count the observations falling in each bin.
depth.freq <- table(depth.cut)
depth.freq
## depth.cut
## [40,45) [45,50) [50,55) [55,60) [60,65) [65,70) [70,75) [75,80) 
##       3       0      19    5092   47932     870      21       3
cbind(depth.freq)
##         depth.freq
## [40,45)          3
## [45,50)          0
## [50,55)         19
## [55,60)       5092
## [60,65)      47932
## [65,70)        870
## [70,75)         21
## [75,80)          3

Grouped Frequency of Table

# Bin edges every 5 units from 40 to 95.
breaks.table <- seq(40, 95, by = 5)
breaks.table
##  [1] 40 45 50 55 60 65 70 75 80 85 90 95
# Bins are left-closed, [a, b). The maximum of table is 95, which equals the
# last break, so include.lowest = TRUE is needed to close the final bin as
# [90,95]; without it that observation becomes NA and drops out of the counts.
table.cut <- cut(diamonds$table, breaks.table, right = FALSE,
                 include.lowest = TRUE)
table.freq <- table(table.cut)
table.freq
## table.cut
## [40,45) [45,50) [50,55) [55,60) [60,65) [65,70) [70,75) [75,80) [80,85) 
##       2       2    3571   41278    8760     310      14       2       0 
## [85,90) [90,95] 
##       0       1
cbind(table.freq)
##         table.freq
## [40,45)          2
## [45,50)          2
## [50,55)       3571
## [55,60)      41278
## [60,65)       8760
## [65,70)        310
## [70,75)         14
## [75,80)          2
## [80,85)          0
## [85,90)          0
## [90,95]          1

Histogram of Price

hist(diamonds$price)

Histogram of Carat

hist(diamonds$carat)

Histogram of Depth

hist(diamonds$depth)

Histogram of Table

hist(diamonds$table)

knitr::opts_chunk$set(echo = TRUE)