#### Question 1: identify the data type of each variable in mtcars
## The variables of mtcars are: mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# mtcars ships with the datasets package: load it into the workspace and
# print every row.
library(datasets)
data(mtcars)
mtcars
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
# Compact structure listing: one line per column showing its storage type.
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# Per-column min, quartiles, median, mean and max.
summary(mtcars)
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
## Median :3.695 Median :3.325 Median :17.71 Median :0.0000
## Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
## Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
## am gear carb
## Min. :0.0000 Min. :3.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
## Median :0.0000 Median :4.000 Median :2.000
## Mean :0.4062 Mean :3.688 Mean :2.812
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :1.0000 Max. :5.000 Max. :8.000
# dplyr's transposed preview: one row per column, tagged with its type.
glimpse(mtcars)
## Observations: 32
## Variables: 11
## $ mpg <dbl> 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19....
## $ cyl <dbl> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, ...
## $ disp <dbl> 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, 1...
## $ hp <dbl> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, ...
## $ drat <dbl> 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.9...
## $ wt <dbl> 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3...
## $ qsec <dbl> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 2...
## $ vs <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, ...
## $ am <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...
## $ gear <dbl> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, ...
## $ carb <dbl> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, ...
#############
### Question 2 What variables are discrete or continuous
# continuous - mpg, disp, hp, drat, wt and qsec
# discrete - cyl, am, vs, gear and carb
# Class of the container object itself (not of its individual columns).
class(mtcars)
## [1] "data.frame"
##############
#### Question 3: three variables (your selection) from mtcars using the R
## function, summary.
# Summary statistics (min, quartiles, median, mean, max) for three selected
# mtcars variables. The question asks for summary(); assigning the bare column
# only echoed the raw data back.
wt_summary <- summary(mtcars$wt)
wt_summary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.513 2.581 3.325 3.217 3.610 5.424
mpg_summary <- summary(mtcars$mpg)
mpg_summary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.40 15.43 19.20 20.09 22.80 33.90
drat_summary <- summary(mtcars$drat)
drat_summary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.760 3.080 3.695 3.597 3.920 4.930
########################
### Question 4 data type of the following three variables in esoph,agegp, alcgp, and tobgp.
# Load esoph from the datasets package and print per-column summaries
# (level counts for the factor columns, quantiles for the numeric ones).
library(datasets)
data(esoph)
summary(esoph)
## agegp alcgp tobgp ncases ncontrols
## 25-34:15 0-39g/day:23 0-9g/day:24 Min. : 0.000 Min. : 1.00
## 35-44:15 40-79 :23 10-19 :24 1st Qu.: 0.000 1st Qu.: 3.00
## 45-54:16 80-119 :21 20-29 :20 Median : 1.000 Median : 6.00
## 55-64:16 120+ :21 30+ :20 Mean : 2.273 Mean :11.08
## 65-74:15 3rd Qu.: 4.000 3rd Qu.:14.00
## 75+ :11 Max. :17.000 Max. :60.00
# Column-wise preview with type tags, used to read off each variable's type.
glimpse(esoph)
## Observations: 88
## Variables: 5
## $ agegp <ord> 25-34, 25-34, 25-34, 25-34, 25-34, 25-34, 25-34, 25-...
## $ alcgp <ord> 0-39g/day, 0-39g/day, 0-39g/day, 0-39g/day, 40-79, 4...
## $ tobgp <ord> 0-9g/day, 10-19, 20-29, 30+, 0-9g/day, 10-19, 20-29,...
## $ ncases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0...
## $ ncontrols <dbl> 40, 10, 6, 5, 27, 7, 4, 7, 2, 1, 2, 1, 1, 1, 2, 60, ...
## agegp = age group, alcgp = alcohol group, tobgp = tobacco group; all three are ordered factors (<ord>)
#######################
### Question 5 frequency and relative frequency distributions of agegp, alcgp, and
#tobgp in esoph.
# Tibble copy of esoph. tbl_df() is deprecated in current dplyr; as_tibble()
# (re-exported by dplyr) is its documented replacement.
esoph_df <- as_tibble(esoph)
# Frequency and relative frequency distributions for agegp, alcgp and tobgp.
# Each factor gets its own objects: reusing eso.freq for all three meant only
# the last table (tobgp) ever had its relative frequencies computed.

# agegp
age.freq <- table(esoph$agegp)
age.freq
##
## 25-34 35-44 45-54 55-64 65-74 75+
## 15 15 16 16 15 11
age.relfreq <- age.freq / nrow(esoph)
age.relfreq
##
## 25-34 35-44 45-54 55-64 65-74 75+
## 0.1704545 0.1704545 0.1818182 0.1818182 0.1704545 0.1250000
round.age.relfreq <- round(age.relfreq, digits = 2)
cbind(round.age.relfreq)
## round.age.relfreq
## 25-34 0.17
## 35-44 0.17
## 45-54 0.18
## 55-64 0.18
## 65-74 0.17
## 75+ 0.12

# alcgp
alc.freq <- table(esoph$alcgp)
alc.freq
##
## 0-39g/day 40-79 80-119 120+
## 23 23 21 21
alc.relfreq <- alc.freq / nrow(esoph)
alc.relfreq
##
## 0-39g/day 40-79 80-119 120+
## 0.2613636 0.2613636 0.2386364 0.2386364
round.alc.relfreq <- round(alc.relfreq, digits = 2)
cbind(round.alc.relfreq)
## round.alc.relfreq
## 0-39g/day 0.26
## 40-79 0.26
## 80-119 0.24
## 120+ 0.24

# tobgp (object names kept from the original script)
eso.freq <- table(esoph$tobgp)
eso.freq
##
## 0-9g/day 10-19 20-29 30+
## 24 24 20 20
eso.relfreq <- eso.freq / nrow(esoph)
eso.relfreq
##
## 0-9g/day 10-19 20-29 30+
## 0.2727273 0.2727273 0.2272727 0.2272727
round.eso.relfreq <- round(eso.relfreq, digits = 2)
round.eso.relfreq
##
## 0-9g/day 10-19 20-29 30+
## 0.27 0.27 0.23 0.23
cbind(round.eso.relfreq)
## round.eso.relfreq
## 0-9g/day 0.27
## 10-19 0.27
## 20-29 0.23
## 30+ 0.23
######################
### Question 6: joint frequency of agegp and alcgp as well as alcgp and tobgp in
# esoph
library(dplyr)
library(ggplot2)
# Two-way count table: age group (rows) by tobacco group (columns).
crosstab <- table(esoph$agegp, esoph$tobgp)
crosstab
##
## 0-9g/day 10-19 20-29 30+
## 25-34 4 4 3 4
## 35-44 4 4 4 3
## 45-54 4 4 4 4
## 55-64 4 4 4 4
## 65-74 4 4 4 3
## 75+ 4 4 1 2
# Two-way count table: age group (rows) by alcohol group (columns).
crosstab2 <- table(esoph$agegp, esoph$alcgp)
crosstab2
##
## 0-39g/day 40-79 80-119 120+
## 25-34 4 4 3 4
## 35-44 4 4 4 3
## 45-54 4 4 4 4
## 55-64 4 4 4 4
## 65-74 4 3 4 4
## 75+ 3 4 2 2
# Same counts as crosstab2, printed in flat-table layout.
ftable(crosstab2)
## 0-39g/day 40-79 80-119 120+
##
## 25-34 4 4 3 4
## 35-44 4 4 4 3
## 45-54 4 4 4 4
## 55-64 4 4 4 4
## 65-74 4 3 4 4
## 75+ 3 4 2 2
# Joint frequency of alcohol group (rows) by tobacco group (columns), the
# second pair the question asks for. This replaces a repeated print of the
# agegp x alcgp table, which crosstab2 already holds.
crosstab3 <- table(esoph$alcgp, esoph$tobgp)
crosstab3
##
## 0-9g/day 10-19 20-29 30+
## 0-39g/day 6 6 5 6
## 40-79 6 6 6 5
## 80-119 6 6 4 5
## 120+ 6 6 5 4
#######################
### Question 7: Load the dataset, diamonds, a dataset built into R as long as you have the package ggplot2 installed.
library(ggplot2)
library(datasets)
# Load the diamonds dataset by name, then print it.
data("diamonds")
diamonds
## # A tibble: 53,940 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4.00 4.05 2.39
## # ... with 53,930 more rows
######################
#.7A Using R code, display the range of the following variables: price, carat,depth, and table.
# Range (minimum and maximum) of price, carat, depth and table.
diaprice <- diamonds$price
range(diaprice)
## [1] 326 18823
diacarat <- diamonds$carat
range(diacarat)
## [1] 0.20 5.01
diadepth <- diamonds$depth
# Report depth's own range; this call previously repeated range(diacarat).
range(diadepth)
## [1] 43 79
diatab <- diamonds$table
range(diatab)
## [1] 43 95
#########################
#7B Report the grouped frequency of any two of the variables identified in 7.a.
# variables used: depth and carat
library(dplyr)
library(ggplot2)
# Grouped frequency of depth in half-open bins [a, b).
depth <- diamonds$depth
range(depth)
## [1] 43 79
# Breaks must span the observed range (43 to 79). The earlier breaks stopped
# at 60, so cut() returned NA for every depth of 60 or more and table()
# dropped those rows: only 5114 of 53,940 diamonds were counted.
depth_break <- seq(40, 80, by = 5)
depth_break
## [1] 40 45 50 55 60 65 70 75 80
depth.cut <- cut(depth, depth_break, right = FALSE)
depth.freq <- table(depth.cut)
# Fail loudly if the breaks ever stop covering the data again.
stopifnot(sum(depth.freq) == length(depth))
depth.freq
## (bin counts changed with the new breaks; re-run to regenerate this output)
cbind(depth.freq)
## (bin counts changed with the new breaks; re-run to regenerate this output)
library(ggplot2)
library(dplyr)
# Grouped frequency of carat in half-open, one-carat bins [a, b).
carat <- diamonds$carat
range(carat)
## [1] 0.20 5.01
# The top break must exceed the maximum (5.01). seq(0, 5.1, by = 1) ends at 5,
# which left the largest diamond outside every interval (counts summed to
# 53,939 instead of 53,940).
carat_break <- seq(0, 6, by = 1.0)
carat_break
## [1] 0 1 2 3 4 5 6
carat.cut <- cut(carat, carat_break, right = FALSE)
carat.freq <- table(carat.cut)
# Fail loudly if the breaks ever stop covering the data again.
stopifnot(sum(carat.freq) == length(carat))
carat.freq
## carat.cut
## [0,1) [1,2) [2,3) [3,4) [4,5) [5,6)
## 34880 16906 2114 34 5 1
cbind(carat.freq)
## carat.freq
## [0,1) 34880
## [1,2) 16906
## [2,3) 2114
## [3,4) 34
## [4,5) 5
## [5,6) 1
#######################
###### Question 7C: histograms for variables in 7A
# One histogram per variable extracted in 7A, drawn with hist() defaults.
hist(diaprice)

hist(diatab)

hist(diacarat)

hist(diadepth)
