Section 1: mtcars distributions

Load up the packages we’ll need

library(datasets)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

Identify the variables for mtcars

Identify each variable and datatype

# First, take a look at what we've got.
# as_tibble() replaces tbl_df(), which is deprecated in current dplyr.
tbl_mtcars <- as_tibble(mtcars)
tbl_mtcars
# A tibble: 32 × 11
     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
*  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1   21.0     6 160.0   110  3.90 2.620 16.46     0     1     4     4
2   21.0     6 160.0   110  3.90 2.875 17.02     0     1     4     4
3   22.8     4 108.0    93  3.85 2.320 18.61     1     1     4     1
4   21.4     6 258.0   110  3.08 3.215 19.44     1     0     3     1
5   18.7     8 360.0   175  3.15 3.440 17.02     0     0     3     2
6   18.1     6 225.0   105  2.76 3.460 20.22     1     0     3     1
7   14.3     8 360.0   245  3.21 3.570 15.84     0     0     3     4
8   24.4     4 146.7    62  3.69 3.190 20.00     1     0     4     2
9   22.8     4 140.8    95  3.92 3.150 22.90     1     0     4     2
10  19.2     6 167.6   123  3.92 3.440 18.30     1     0     4     4
# ... with 22 more rows
# Identify each variable and datatype in mtcars.
# Start with the class of the container itself.
class(mtcars)
[1] "data.frame"
# Storage mode of every mtcars column, as a named character vector.
# vapply() guarantees exactly one string per column, whereas sapply() can
# silently change its return type; the extra c() wrapper was a no-op.
mtcars_var <- vapply(mtcars, mode, character(1))
mtcars_var
      mpg       cyl      disp        hp      drat        wt      qsec 
"numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" 
       vs        am      gear      carb 
"numeric" "numeric" "numeric" "numeric" 
# A one-column layout is easier to scan than the wrapped named vector.
cbind(mtcars_var)
     mtcars_var
mpg  "numeric" 
cyl  "numeric" 
disp "numeric" 
hp   "numeric" 
drat "numeric" 
wt   "numeric" 
qsec "numeric" 
vs   "numeric" 
am   "numeric" 
gear "numeric" 
carb "numeric" 

Next, report the classification, either discrete or continuous

Looking at the data for each variable, decide whether it is discrete or continuous and record the result in a list (continuous variables are labelled "Variable" below).
# One label per mtcars column, in column order (mpg, cyl, disp, ..., carb).
class_list_mtcars <- c(
  "Variable", "Discrete", "Variable", "Variable", "Variable", "Variable",
  "Variable", "Discrete", "Discrete", "Discrete", "Discrete"
)

Then, combine the two vectors into a data frame, making sure each variable's classification lines up with the correct row!

# Pair each column's data type with its classification; the row names come
# from the names carried by mtcars_var.
df_mtcars_var <- data.frame(
  mtcars_var = mtcars_var,
  class_list_mtcars = class_list_mtcars
)

For the table to make sense, give it headers:

# Replace the auto-generated column names with readable headers.
mtcars_c1 <- "Data Type"
mtcars_c2 <- "Variable Class."
colnames(df_mtcars_var) <- c(mtcars_c1, mtcars_c2)

print(df_mtcars_var)
     Data Type Variable Class.
mpg    numeric        Variable
cyl    numeric        Discrete
disp   numeric        Variable
hp     numeric        Variable
drat   numeric        Variable
wt     numeric        Variable
qsec   numeric        Variable
vs     numeric        Discrete
am     numeric        Discrete
gear   numeric        Discrete
carb   numeric        Discrete

Report the distribution of the entire dataset of the variables using summary. This wasn’t part of the assignment, but I wanted to take a look at it. See next entry.

# Min, quartiles, median, mean and max for every column at once.
summary(tbl_mtcars)
      mpg             cyl             disp             hp       
 Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
 1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
 Median :19.20   Median :6.000   Median :196.3   Median :123.0  
 Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
 3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
 Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
      drat             wt             qsec             vs        
 Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
 1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
 Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
 Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
 3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
 Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
       am              gear            carb      
 Min.   :0.0000   Min.   :3.000   Min.   :1.000  
 1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
 Median :0.0000   Median :4.000   Median :2.000  
 Mean   :0.4062   Mean   :3.688   Mean   :2.812  
 3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
 Max.   :1.0000   Max.   :5.000   Max.   :8.000  

Now select three variables and report their distributions

# Pull the values of three variables out of the data frame as plain vectors.
mtcars_v1_mpg <- mtcars[["mpg"]]
mtcars_v2_cyl <- mtcars[["cyl"]]
mtcars_v3_carb <- mtcars[["carb"]]

First, the easy ones: cyl and carb. Since these are discrete and take only a handful of distinct values, they can be tabulated directly without any grouping (read: easier)

# Count the cars per cylinder value, then bind the counts into a single
# column because that layout is easier to read.
mtcars_v2_cyl_tbl <- table(mtcars_v2_cyl)
mtcars_v2_bind <- cbind(mtcars_v2_cyl_tbl)
mtcars_v2_bind
  mtcars_v2_cyl_tbl
4                11
6                 7
8                14
# Same treatment for carb: count per value, shown as one column.
mtcars_v3_carb_tbl <- table(mtcars_v3_carb)
mtcars_v3_bind <- cbind(mtcars_v3_carb_tbl)
mtcars_v3_bind
  mtcars_v3_carb_tbl
1                  7
2                 10
3                  3
4                 10
6                  1
8                  1

The mpg variable is continuous, so we will group its values into intervals before displaying the distribution

# First, look at the data to help decide how to break it down:
# the min and max show the range the bins have to cover.
summary(mtcars_v1_mpg)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  10.40   15.42   19.20   20.09   22.80   33.90 
# Print the raw values as well.
mtcars_v1_mpg
 [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2
[15] 10.4 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4
[29] 15.8 19.7 15.0 21.4

Next, create the sequence groups

# Bin edges for mpg: 10 to 40 in steps of 5.
mpg_seq <- seq(from = 10, to = 40, by = 5)
mpg_seq
[1] 10 15 20 25 30 35 40

Now CUT into groups and print it

# Assign every mpg value to a left-closed interval [a, b) defined by mpg_seq.
mtcars_v1_mpg_cut <- cut(mtcars[["mpg"]], breaks = mpg_seq, right = FALSE)

# Count the observations per interval and show them as one column.
mtcars_v1_dist <- table(mtcars_v1_mpg_cut)
cbind(mtcars_v1_dist)
        mtcars_v1_dist
[10,15)              5
[15,20)             13
[20,25)              8
[25,30)              2
[30,35)              4
[35,40)              0

Next Section, we’re using the data from esoph.

Pull out the data and assign variables so we can see what we’re dealing with

# First, let's see what we're dealing with:
# level counts for the factor columns, numeric summaries for the rest.
summary(esoph)
   agegp          alcgp         tobgp        ncases         ncontrols    
 25-34:15   0-39g/day:23   0-9g/day:24   Min.   : 0.000   Min.   : 1.00  
 35-44:15   40-79    :23   10-19   :24   1st Qu.: 0.000   1st Qu.: 3.00  
 45-54:16   80-119   :21   20-29   :20   Median : 1.000   Median : 6.00  
 55-64:16   120+     :21   30+     :20   Mean   : 2.273   Mean   :11.08  
 65-74:15                                3rd Qu.: 4.000   3rd Qu.:14.00  
 75+  :11                                Max.   :17.000   Max.   :60.00  
# Create a tibble.
# as_tibble() replaces tbl_df(), which is deprecated in current dplyr.
tbl_esoph <- as_tibble(esoph)
tbl_esoph
# A tibble: 88 × 5
   agegp     alcgp    tobgp ncases ncontrols
   <ord>     <ord>    <ord>  <dbl>     <dbl>
1  25-34 0-39g/day 0-9g/day      0        40
2  25-34 0-39g/day    10-19      0        10
3  25-34 0-39g/day    20-29      0         6
4  25-34 0-39g/day      30+      0         5
5  25-34     40-79 0-9g/day      0        27
6  25-34     40-79    10-19      0         7
7  25-34     40-79    20-29      0         4
8  25-34     40-79      30+      0         7
9  25-34    80-119 0-9g/day      0         2
10 25-34    80-119    10-19      0         1
# ... with 78 more rows
# Extract the three grouping columns we will be working with.
esoph_agegp <- esoph[["agegp"]]
esoph_alcgp <- esoph[["alcgp"]]
esoph_tobgp <- esoph[["tobgp"]]

As in mtcars, identify the data type and classification

# Identify each variable and its data type in esoph.
# mode() only reports the storage mode, so the ordered factors agegp, alcgp
# and tobgp would all be listed as "numeric". Taking the first class() entry
# reports them as "ordered"; vapply() keeps the result to one string per column.
esoph_var <- vapply(esoph, function(col) class(col)[1], character(1))
esoph_var
##     agegp     alcgp     tobgp    ncases ncontrols 
## "ordered" "ordered" "ordered" "numeric" "numeric" 
# A one-column layout is easier to scan.
cbind(esoph_var)
##           esoph_var
## agegp     "ordered"
## alcgp     "ordered"
## tobgp     "ordered"
## ncases    "numeric"
## ncontrols "numeric"
# Next, report the classification, either discrete or continuous
# ("Variable" is the label used here for continuous).
# agegp is an ordered factor of age bands, just like alcgp and tobgp, so it
# is classed as discrete too.
# NOTE(review): ncases and ncontrols look like counts and are arguably
# discrete as well - confirm before changing their labels.
class_list_esoph <- c("Discrete", "Discrete", "Discrete", "Variable", "Variable")

# Then combine the two vectors in a data frame, making sure each
# classification lines up with the correct row.
df_esoph_var <- data.frame(esoph_var, class_list_esoph)

# For the table to make sense, give it headers.
esoph_c1 <- "Data Type"
esoph_c2 <- "Variable Class."
names(df_esoph_var) <- c(esoph_c1, esoph_c2)
print(df_esoph_var)
##           Data Type Variable Class.
## agegp       ordered        Discrete
## alcgp       ordered        Discrete
## tobgp       ordered        Discrete
## ncases      numeric        Variable
## ncontrols   numeric        Variable

Report the frequency and relative frequency of the three variables agegp, alcgp, and tobgp.

# Frequency of each variable, starting with the age groups.
# NOTE(review): "agege" in the name looks like a typo for "agegp"; it is kept
# because later statements refer to esoph_agege_tbl.
esoph_agege_tbl <- table(esoph_agegp)
cbind(esoph_agege_tbl)
      esoph_agege_tbl
25-34              15
35-44              15
45-54              16
55-64              16
65-74              15
75+                11
# Frequency of the alcohol-consumption groups.
esoph_alcgp_tbl <- table(esoph_alcgp)
cbind(esoph_alcgp_tbl)
          esoph_alcgp_tbl
0-39g/day              23
40-79                  23
80-119                 21
120+                   21
# Frequency of the tobacco-consumption groups.
esoph_tobgp_tbl <- table(esoph_tobgp)
# Display the tobacco table itself (not the age table again).
cbind(esoph_tobgp_tbl)
##          esoph_tobgp_tbl
## 0-9g/day              24
## 10-19                 24
## 20-29                 20
## 30+                   20

Now, calculate the relative frequency distribution for each

# Step 1: total number of observations (rows) in esoph.
esoph_nrow <- nrow(esoph)
# Step 2: each age group's share of that total, rounded to two decimals.
esoph_age_relF <- round(esoph_agege_tbl / esoph_nrow, digits = 2)
cbind(esoph_age_relF)
      esoph_age_relF
25-34           0.17
35-44           0.17
45-54           0.18
55-64           0.18
65-74           0.17
75+             0.12
# Same calculation for alcgp.
esoph_alcgp_relF <- round(esoph_alcgp_tbl / esoph_nrow, digits = 2)
cbind(esoph_alcgp_relF)
          esoph_alcgp_relF
0-39g/day             0.26
40-79                 0.26
80-119                0.24
120+                  0.24
# Same calculation for tobgp.
esoph_tobgp_relF <- round(esoph_tobgp_tbl / esoph_nrow, digits = 2)
cbind(esoph_tobgp_relF)
         esoph_tobgp_relF
0-9g/day             0.27
10-19                0.27
20-29                0.23
30+                  0.23

Now, we will report the joint frequency of agegp and alcgp

using esoph_agegp and esoph_alcgp variables created earlier

use table and ftable to clean it up a bit

# Cross-tabulate age group against alcohol group; the dimension names are
# spelled out so the ftable header shows which variable is which.
esoph_agegp_x_alcgp <- table(
  esoph_agegp, esoph_alcgp,
  dnn = c("esoph_agegp", "esoph_alcgp")
)
ftable(esoph_agegp_x_alcgp)
            esoph_alcgp 0-39g/day 40-79 80-119 120+
esoph_agegp                                        
25-34                           4     4      3    4
35-44                           4     4      4    3
45-54                           4     4      4    4
55-64                           4     4      4    4
65-74                           4     3      4    4
75+                             3     4      2    2

Next we will report the joint frequency of alcgp and tobgp…this time, use xtabs

# Cross-tabulate alcohol group against tobacco group via the formula interface.
esoph_alcgp_x_tobgp <- xtabs(~ esoph_alcgp + esoph_tobgp)
ftable(esoph_alcgp_x_tobgp)
            esoph_tobgp 0-9g/day 10-19 20-29 30+
esoph_alcgp                                     
0-39g/day                      6     6     5   6
40-79                          6     6     6   5
80-119                         6     6     4   5
120+                           6     6     5   4

Diamonds are forever

Set up the data and variables, and take a look at what we have to work with

library(ggplot2)  #to have dataset diamonds available

# as_tibble() replaces tbl_df(), which is deprecated in current dplyr.
tbl_diamonds <- as_tibble(diamonds)
summary(tbl_diamonds)
     carat               cut        color        clarity     
 Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065  
 1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258  
 Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194  
 Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171  
 3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066  
 Max.   :5.0100                     I: 5422   VVS1   : 3655  
                                    J: 2808   (Other): 2531  
     depth           table           price             x         
 Min.   :43.00   Min.   :43.00   Min.   :  326   Min.   : 0.000  
 1st Qu.:61.00   1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710  
 Median :61.80   Median :57.00   Median : 2401   Median : 5.700  
 Mean   :61.75   Mean   :57.46   Mean   : 3933   Mean   : 5.731  
 3rd Qu.:62.50   3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540  
 Max.   :79.00   Max.   :95.00   Max.   :18823   Max.   :10.740  
                                                                 
       y                z         
 Min.   : 0.000   Min.   : 0.000  
 1st Qu.: 4.720   1st Qu.: 2.910  
 Median : 5.710   Median : 3.530  
 Mean   : 5.735   Mean   : 3.539  
 3rd Qu.: 6.540   3rd Qu.: 4.040  
 Max.   :58.900   Max.   :31.800  
                                  
# Extract the four columns of interest as plain vectors.
dia_carat <- diamonds[["carat"]]
dia_price <- diamonds[["price"]]
dia_depth <- diamonds[["depth"]]
dia_tbl <- diamonds[["table"]]

Report a grouped frequency of carat and price

# Break the range of carat down into 0.2-wide, left-closed bins.
# The breaks must reach past the largest value (5.01 in the summary above).
# seq(0, 5.1, by = 0.2) stops at 5.0, so that observation fell outside every
# interval and was silently left out of the table.
# NOTE(review): multiples of 0.2 are inexact in floating point, so values lying
# exactly on a break may be counted in the bin below - confirm if that matters.
carat_range <- seq(0, 5.2, by = 0.2)

carat_group <- cut(dia_carat, carat_range, right = FALSE)
carat_group_tbl <- table(carat_group)
cbind(carat_group_tbl)
##           carat_group_tbl
## [0,0.2)                 0
## [0.2,0.4)           13092
## [0.4,0.6)           11356
## [0.6,0.8)            6679
## [0.8,1)              3753
## [1,1.2)              9905
## [1.2,1.4)            2746
## [1.4,1.6)            3066
## [1.6,1.8)            1048
## [1.8,2)               141
## [2,2.2)              1660
## [2.2,2.4)             306
## [2.4,2.6)             118
## [2.6,2.8)              30
## [2.8,3)                 0
## [3,3.2)                27
## [3.2,3.4)               3
## [3.4,3.6)               2
## [3.6,3.8)               2
## [3.8,4)                 0
## [4,4.2)                 4
## [4.2,4.4)               0
## [4.4,4.6)               1
## [4.6,4.8)               0
## [4.8,5)                 0
## [5,5.2)                 1
# Break the prices down into 1000-wide, left-closed bins from 0 to 20000.
price_range <- seq(from = 0, to = 20000, by = 1000)

price_group <- cut(dia_price, breaks = price_range, right = FALSE)
price_group_tbl <- table(price_group)
cbind(price_group_tbl)
##                   price_group_tbl
## [0,1e+03)                   14499
## [1e+03,2e+03)                9704
## [2e+03,3e+03)                6131
## [3e+03,4e+03)                4226
## [4e+03,5e+03)                4653
## [5e+03,6e+03)                3174
## [6e+03,7e+03)                2278
## [7e+03,8e+03)                1669
## [8e+03,9e+03)                1307
## [9e+03,1e+04)                1076
## [1e+04,1.1e+04)               935
## [1.1e+04,1.2e+04)             824
## [1.2e+04,1.3e+04)             702
## [1.3e+04,1.4e+04)             603
## [1.4e+04,1.5e+04)             503
## [1.5e+04,1.6e+04)             514
## [1.6e+04,1.7e+04)             424
## [1.7e+04,1.8e+04)             406
## [1.8e+04,1.9e+04)             312
## [1.9e+04,2e+04)                 0

Finally, create histograms for each of the four variables

# One histogram per variable, each with its own title, axis label and colour.
hist(dia_carat, col = "green", main = "Histogram for carat", xlab = "Carat")

hist(dia_price, col = "blue", main = "Histogram for price", xlab = "Price")

hist(dia_depth, col = "brown", main = "Histogram for depth", xlab = "Depth")

hist(dia_tbl, col = "yellow", main = "Histogram for table", xlab = "Table")