library(datasets)
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
# First, take a look at what we've got.
# as_tibble() (re-exported by dplyr) replaces tbl_df(), which dplyr has
# deprecated; the tibble print method shows only the first rows.
tbl_mtcars <- as_tibble(mtcars)
tbl_mtcars
# A tibble: 32 × 11
mpg cyl disp hp drat wt qsec vs am gear carb
* <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
3 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
4 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
5 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
6 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
7 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
8 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
9 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
10 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
# ... with 22 more rows
# Identify each variable and data type in mtcars.
# Start with the class of the container object itself.
class(mtcars)
[1] "data.frame"
# Storage mode of every mtcars column, as a named character vector.
# vapply() fixes the result type to one string per column (sapply()'s return
# type depends on its input) and already yields a plain vector, so no c()
# wrapper is needed.
mtcars_var <- vapply(mtcars, mode, character(1))
mtcars_var
mpg cyl disp hp drat wt qsec
"numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
vs am gear carb
"numeric" "numeric" "numeric" "numeric"
# cbind() turns the named vector into a one-column matrix, so each variable
# prints on its own row, which is easier to read.
cbind (mtcars_var)
mtcars_var
mpg "numeric"
cyl "numeric"
disp "numeric"
hp "numeric"
drat "numeric"
wt "numeric"
qsec "numeric"
vs "numeric"
am "numeric"
gear "numeric"
carb "numeric"
# Hand-assigned measurement class for each mtcars column, in column order.
class_list_mtcars <- c(
  "Variable", "Discrete", "Variable", "Variable", "Variable", "Variable",
  "Variable", "Discrete", "Discrete", "Discrete", "Discrete"
)
# Headers for the two columns of the overview table.
mtcars_c1 <- "Data Type"
mtcars_c2 <- "Variable Class."
# Pair each column's storage mode with its class label and apply the headers.
df_mtcars_var <- setNames(
  data.frame(mtcars_var, class_list_mtcars),
  c(mtcars_c1, mtcars_c2)
)
print(df_mtcars_var)
Data Type Variable Class.
mpg numeric Variable
cyl numeric Discrete
disp numeric Variable
hp numeric Variable
drat numeric Variable
wt numeric Variable
qsec numeric Variable
vs numeric Discrete
am numeric Discrete
gear numeric Discrete
carb numeric Discrete
# Per-column summary statistics (min, quartiles, mean, max) for mtcars.
summary(tbl_mtcars)
mpg cyl disp hp
Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
Median :19.20 Median :6.000 Median :196.3 Median :123.0
Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
drat wt qsec vs
Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
Median :3.695 Median :3.325 Median :17.71 Median :0.0000
Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
am gear carb
Min. :0.0000 Min. :3.000 Min. :1.000
1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
Median :0.0000 Median :4.000 Median :2.000
Mean :0.4062 Mean :3.688 Mean :2.812
3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
Max. :1.0000 Max. :5.000 Max. :8.000
# Pull the values of three variables out of the data frame as plain vectors.
mtcars_v1_mpg <- mtcars[["mpg"]]
mtcars_v2_cyl <- mtcars[["cyl"]]
mtcars_v3_carb <- mtcars[["carb"]]
# Count the cars per cylinder number.
mtcars_v2_cyl_tbl <- table(mtcars_v2_cyl)
# A one-column matrix reads better than the default horizontal table print.
mtcars_v2_bind <- cbind(mtcars_v2_cyl_tbl)
mtcars_v2_bind
mtcars_v2_cyl_tbl
4 11
6 7
8 14
# Count the cars per carburetor number and print the counts as a one-column
# matrix (same layout as the cylinder table).
mtcars_v3_carb_tbl <- table (mtcars_v3_carb)
mtcars_v3_bind <- cbind(mtcars_v3_carb_tbl)
mtcars_v3_bind
mtcars_v3_carb_tbl
1 7
2 10
3 3
4 10
6 1
8 1
# First, look at the spread of mpg to help decide how to break it into bins.
summary(mtcars_v1_mpg)
Min. 1st Qu. Median Mean 3rd Qu. Max.
10.40 15.42 19.20 20.09 22.80 33.90
# Print the raw mpg values.
mtcars_v1_mpg
[1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2
[15] 10.4 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4
[29] 15.8 19.7 15.0 21.4
# Bin edges for mpg: 10 to 40 in steps of 5.
mpg_seq <- seq(from = 10, to = 40, by = 5)
mpg_seq
[1] 10 15 20 25 30 35 40
# Assign every mpg value to a left-closed bin [lower, upper).
mtcars_v1_mpg_cut <- cut(mtcars$mpg, breaks = mpg_seq, right = FALSE)
# Count the cars per bin and show the counts as a one-column matrix.
mtcars_v1_dist <- table(mtcars_v1_mpg_cut)
cbind(mtcars_v1_dist)
mtcars_v1_dist
[10,15) 5
[15,20) 13
[20,25) 8
[25,30) 2
[30,35) 4
[35,40) 0
# First, summarise every esoph column to see what we're dealing with.
summary(esoph)
agegp alcgp tobgp ncases ncontrols
25-34:15 0-39g/day:23 0-9g/day:24 Min. : 0.000 Min. : 1.00
35-44:15 40-79 :23 10-19 :24 1st Qu.: 0.000 1st Qu.: 3.00
45-54:16 80-119 :21 20-29 :20 Median : 1.000 Median : 6.00
55-64:16 120+ :21 30+ :20 Mean : 2.273 Mean :11.08
65-74:15 3rd Qu.: 4.000 3rd Qu.:14.00
75+ :11 Max. :17.000 Max. :60.00
# Create a tibble.
# as_tibble() (re-exported by dplyr) replaces tbl_df(), which dplyr has
# deprecated.
tbl_esoph <- as_tibble(esoph)
tbl_esoph
# A tibble: 88 × 5
agegp alcgp tobgp ncases ncontrols
<ord> <ord> <ord> <dbl> <dbl>
1 25-34 0-39g/day 0-9g/day 0 40
2 25-34 0-39g/day 10-19 0 10
3 25-34 0-39g/day 20-29 0 6
4 25-34 0-39g/day 30+ 0 5
5 25-34 40-79 0-9g/day 0 27
6 25-34 40-79 10-19 0 7
7 25-34 40-79 20-29 0 4
8 25-34 40-79 30+ 0 7
9 25-34 80-119 0-9g/day 0 2
10 25-34 80-119 10-19 0 1
# ... with 78 more rows
# Extract the three grouping columns we will tabulate as standalone vectors.
esoph_agegp <- esoph[["agegp"]]
esoph_alcgp <- esoph[["alcgp"]]
esoph_tobgp <- esoph[["tobgp"]]
# Identify each variable and data type in esoph.
# class() is used instead of mode(): mode() reports "numeric" for factors,
# which hides that agegp, alcgp and tobgp are ordered factors. Only the first
# class is kept because an ordered factor carries two classes
# (c("ordered", "factor")).
esoph_var <- vapply(esoph, function(column) class(column)[1], character(1))
esoph_var
##     agegp     alcgp     tobgp    ncases ncontrols
## "ordered" "ordered" "ordered" "numeric" "numeric"
# A one-column matrix prints one variable per row, which is easier to read.
cbind(esoph_var)
##           esoph_var
## agegp     "ordered"
## alcgp     "ordered"
## tobgp     "ordered"
## ncases    "numeric"
## ncontrols "numeric"
# Next, report the classification, either discrete or continuous
# ("Variable" is the label used here for continuous).
# The three ordered factors are all grouped categories, so all three are
# labelled "Discrete".
# NOTE(review): ncases and ncontrols are counts; they keep the "Variable"
# label of the original classification - confirm this is intended.
class_list_esoph <- c("Discrete", "Discrete", "Discrete", "Variable", "Variable")
# Then knit the two vectors together in a data frame, making sure the
# variable definitions match in the rows created.
df_esoph_var <- data.frame(esoph_var, class_list_esoph)
# For the table to make sense, give it headers.
esoph_c1 <- "Data Type"
esoph_c2 <- "Variable Class."
names(df_esoph_var) <- c(esoph_c1, esoph_c2)
print(df_esoph_var)
##           Data Type Variable Class.
## agegp       ordered        Discrete
## alcgp       ordered        Discrete
## tobgp       ordered        Discrete
## ncases      numeric        Variable
## ncontrols   numeric        Variable
# Frequency of each variable, starting with the age groups:
# count the rows per age group and print the counts as a one-column matrix.
esoph_agege_tbl <- table(esoph_agegp)
cbind(esoph_agege_tbl)
esoph_agege_tbl
25-34 15
35-44 15
45-54 16
55-64 16
65-74 15
75+ 11
# Count the rows per alcohol-consumption group.
esoph_alcgp_tbl <- table(esoph_alcgp)
cbind(esoph_alcgp_tbl)
esoph_alcgp_tbl
0-39g/day 23
40-79 23
80-119 21
120+ 21
# Count the rows per tobacco-consumption group and print that table
# (the tobacco counts, not the age-group counts).
esoph_tobgp_tbl <- table(esoph_tobgp)
cbind(esoph_tobgp_tbl)
##          esoph_tobgp_tbl
## 0-9g/day              24
## 10-19                 24
## 20-29                 20
## 30+                   20
# Step 1: count the total number of observations (rows) in esoph.
esoph_nrow <- nrow(esoph)
# Step 2: relative frequency of each age group, i.e. its count divided by the
# total number of rows, rounded to two decimals (a proportion, not a percent).
esoph_age_relF <- round(esoph_agege_tbl/esoph_nrow, digits = 2)
cbind(esoph_age_relF)
esoph_age_relF
25-34 0.17
35-44 0.17
45-54 0.18
55-64 0.18
65-74 0.17
75+ 0.12
# Same for alcgp: relative frequency of each alcohol group, two decimals.
esoph_alcgp_relF <- round (esoph_alcgp_tbl/esoph_nrow, digits = 2)
cbind(esoph_alcgp_relF)
esoph_alcgp_relF
0-39g/day 0.26
40-79 0.26
80-119 0.24
120+ 0.24
# Same for tobgp: relative frequency of each tobacco group, two decimals.
esoph_tobgp_relF <- round (esoph_tobgp_tbl/esoph_nrow, digits = 2)
cbind(esoph_tobgp_relF)
esoph_tobgp_relF
0-9g/day 0.27
10-19 0.27
20-29 0.23
30+ 0.23
# Two-way contingency table: age group (rows) by alcohol group (columns),
# printed in flat form with ftable().
esoph_agegp_x_alcgp <- table(esoph_agegp, esoph_alcgp)
ftable(esoph_agegp_x_alcgp)
esoph_alcgp 0-39g/day 40-79 80-119 120+
esoph_agegp
25-34 4 4 3 4
35-44 4 4 4 3
45-54 4 4 4 4
55-64 4 4 4 4
65-74 4 3 4 4
75+ 3 4 2 2
# Two-way contingency table: alcohol group (rows) by tobacco group (columns),
# built with the formula interface of xtabs() and printed with ftable().
esoph_alcgp_x_tobgp <- xtabs(~esoph_alcgp + esoph_tobgp)
ftable(esoph_alcgp_x_tobgp)
esoph_tobgp 0-9g/day 10-19 20-29 30+
esoph_alcgp
0-39g/day 6 6 5 6
40-79 6 6 6 5
80-119 6 6 4 5
120+ 6 6 5 4
library(ggplot2) # to have dataset diamonds available
# as_tibble() (re-exported by dplyr) replaces tbl_df(), which dplyr has
# deprecated.
tbl_diamonds <- as_tibble(diamonds)
summary(tbl_diamonds)
carat cut color clarity
Min. :0.2000 Fair : 1610 D: 6775 SI1 :13065
1st Qu.:0.4000 Good : 4906 E: 9797 VS2 :12258
Median :0.7000 Very Good:12082 F: 9542 SI2 : 9194
Mean :0.7979 Premium :13791 G:11292 VS1 : 8171
3rd Qu.:1.0400 Ideal :21551 H: 8304 VVS2 : 5066
Max. :5.0100 I: 5422 VVS1 : 3655
J: 2808 (Other): 2531
depth table price x
Min. :43.00 Min. :43.00 Min. : 326 Min. : 0.000
1st Qu.:61.00 1st Qu.:56.00 1st Qu.: 950 1st Qu.: 4.710
Median :61.80 Median :57.00 Median : 2401 Median : 5.700
Mean :61.75 Mean :57.46 Mean : 3933 Mean : 5.731
3rd Qu.:62.50 3rd Qu.:59.00 3rd Qu.: 5324 3rd Qu.: 6.540
Max. :79.00 Max. :95.00 Max. :18823 Max. :10.740
y z
Min. : 0.000 Min. : 0.000
1st Qu.: 4.720 1st Qu.: 2.910
Median : 5.710 Median : 3.530
Mean : 5.735 Mean : 3.539
3rd Qu.: 6.540 3rd Qu.: 4.040
Max. :58.900 Max. :31.800
# Extract the four numeric diamonds columns used below as standalone vectors.
dia_carat <- diamonds[["carat"]]
dia_price <- diamonds[["price"]]
dia_depth <- diamonds[["depth"]]
dia_tbl <- diamonds[["table"]]
# Minimum and maximum of each variable, used to choose the bin edges.
range(diamonds[["carat"]])
## [1] 0.20 5.01
range(diamonds[["price"]])
## [1] 326 18823
range(diamonds[["depth"]])
## [1] 43 79
range(diamonds[["table"]])
## [1] 43 95
# Break the range of carat down into 0.2-wide, left-closed bins.
# The upper limit is 5.2 so that the last break lies above the largest carat
# (5.01). With right = FALSE a value at or beyond the last break becomes NA and
# is silently dropped by table(); with an upper limit of 5.1 the sequence
# stopped at 5.0 and the 5.01-carat diamond was lost from the counts.
# NOTE(review): breaks built with seq(by = 0.2) are subject to floating-point
# rounding, so a carat lying exactly on a bin edge (e.g. 0.6) can be counted in
# the lower bin - confirm whether that matters for this analysis.
carat_range <- seq(0, 5.2, by = 0.2)
carat_group <- cut(dia_carat, carat_range, right = FALSE)
carat_group_tbl <- table(carat_group)
cbind(carat_group_tbl)
## carat_group_tbl
## [0,0.2) 0
## [0.2,0.4) 13092
## [0.4,0.6) 11356
## [0.6,0.8) 6679
## [0.8,1) 3753
## [1,1.2) 9905
## [1.2,1.4) 2746
## [1.4,1.6) 3066
## [1.6,1.8) 1048
## [1.8,2) 141
## [2,2.2) 1660
## [2.2,2.4) 306
## [2.4,2.6) 118
## [2.6,2.8) 30
## [2.8,3) 0
## [3,3.2) 27
## [3.2,3.4) 3
## [3.4,3.6) 2
## [3.6,3.8) 2
## [3.8,4) 0
## [4,4.2) 4
## [4.2,4.4) 0
## [4.4,4.6) 1
## [4.6,4.8) 0
## [4.8,5) 0
## [5,5.2) 1
# Break the prices down into 1000-wide, left-closed bins from 0 to 20000.
price_range <- seq(from = 0, to = 20000, by = 1000)
price_group <- cut(dia_price, breaks = price_range, right = FALSE)
# Count the diamonds per price bin and show the counts as a one-column matrix.
price_group_tbl <- table(price_group)
cbind(price_group_tbl)
## price_group_tbl
## [0,1e+03) 14499
## [1e+03,2e+03) 9704
## [2e+03,3e+03) 6131
## [3e+03,4e+03) 4226
## [4e+03,5e+03) 4653
## [5e+03,6e+03) 3174
## [6e+03,7e+03) 2278
## [7e+03,8e+03) 1669
## [8e+03,9e+03) 1307
## [9e+03,1e+04) 1076
## [1e+04,1.1e+04) 935
## [1.1e+04,1.2e+04) 824
## [1.2e+04,1.3e+04) 702
## [1.3e+04,1.4e+04) 603
## [1.4e+04,1.5e+04) 503
## [1.5e+04,1.6e+04) 514
## [1.6e+04,1.7e+04) 424
## [1.7e+04,1.8e+04) 406
## [1.8e+04,1.9e+04) 312
## [1.9e+04,2e+04) 0
# One histogram per extracted diamonds variable, each with its own title,
# x-axis label and fill colour.
hist(
  x = dia_carat,
  main = "Histogram for carat",
  xlab = "Carat",
  col = "green"
)
hist(
  x = dia_price,
  main = "Histogram for price",
  xlab = "Price",
  col = "blue"
)
hist(
  x = dia_depth,
  main = "Histogram for depth",
  xlab = "Depth",
  col = "brown"
)
hist(
  x = dia_tbl,
  main = "Histogram for table",
  xlab = "Table",
  col = "yellow"
)