library(datasets)
library(plyr)
library(ggplot2)
# 1. Identify the data type of each variable in mtcars.
# Load the built-in data set, then auto-print every row for inspection.
data("mtcars")
mtcars
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
# 2. Classify each mtcars variable as discrete or continuous.
# The compact structure listing shows the storage type of every column.
str(object = mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
#discrete= cyl, vs, am, gear, carb
#continuous= mpg, disp, hp, drat, wt, qsec
# 3. Report the distribution of three mtcars variables (am, gear, carb)
#    using the R function summary().
summary(mtcars[["am"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.4062 1.0000 1.0000
summary(mtcars[["gear"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 3.000 4.000 3.688 4.000 5.000
summary(mtcars[["carb"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 2.000 2.812 4.000 8.000
# 4. Identify the data types of the variables in esoph.
# str() lists each column's type; summary() then tabulates the factor levels
# and gives five-number summaries of the numeric columns.
data("esoph")
str(object = esoph)
## 'data.frame': 88 obs. of 5 variables:
## $ agegp : Ord.factor w/ 6 levels "25-34"<"35-44"<..: 1 1 1 1 1 1 1 1 1 1 ...
## $ alcgp : Ord.factor w/ 4 levels "0-39g/day"<"40-79"<..: 1 1 1 1 2 2 2 2 3 3 ...
## $ tobgp : Ord.factor w/ 4 levels "0-9g/day"<"10-19"<..: 1 2 3 4 1 2 3 4 1 2 ...
## $ ncases : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ncontrols: num 40 10 6 5 27 7 4 7 2 1 ...
summary(object = esoph)
## agegp alcgp tobgp ncases ncontrols
## 25-34:15 0-39g/day:23 0-9g/day:24 Min. : 0.000 Min. : 1.00
## 35-44:15 40-79 :23 10-19 :24 1st Qu.: 0.000 1st Qu.: 3.00
## 45-54:16 80-119 :21 20-29 :20 Median : 1.000 Median : 6.00
## 55-64:16 120+ :21 30+ :20 Mean : 2.273 Mean :11.08
## 65-74:15 3rd Qu.: 4.000 3rd Qu.:14.00
## 75+ :11 Max. :17.000 Max. :60.00
# 5. Report the frequency and relative frequency distributions of agegp,
#    alcgp, and tobgp in esoph.
# Frequency and relative frequency for agegp.
freqagegp <- table(esoph$agegp)
freqagegp
##
## 25-34 35-44 45-54 55-64 65-74 75+
## 15 15 16 16 15 11
# prop.table() divides by the table's own total, so the shares sum to 1 even
# if some rows were excluded from the count (e.g. missing values).
relfreqagegp <- prop.table(freqagegp)
round_relfreqagegp <- round(relfreqagegp, digits = 2)
round_relfreqagegp
##
## 25-34 35-44 45-54 55-64 65-74 75+
## 0.17 0.17 0.18 0.18 0.17 0.12
# Counts and rounded shares side by side, one row per age group.
cbind(freqagegp, round_relfreqagegp)
## freqagegp round_relfreqagegp
## 25-34 15 0.17
## 35-44 15 0.17
## 45-54 16 0.18
## 55-64 16 0.18
## 65-74 15 0.17
## 75+ 11 0.12
# Frequency and relative frequency for alcgp.
freqalcgp <- table(esoph[["alcgp"]])
freqalcgp
##
## 0-39g/day 40-79 80-119 120+
## 23 23 21 21
# Share of the 88 rows that falls in each alcohol group.
relfreqalcgp <- prop.table(freqalcgp)
round_relfreqalcgp <- round(relfreqalcgp, 2)
round_relfreqalcgp
##
## 0-39g/day 40-79 80-119 120+
## 0.26 0.26 0.24 0.24
# Counts and rounded shares side by side.
cbind(freqalcgp, round_relfreqalcgp)
## freqalcgp round_relfreqalcgp
## 0-39g/day 23 0.26
## 40-79 23 0.26
## 80-119 21 0.24
## 120+ 21 0.24
# Frequency and relative frequency for tobgp.
freqtobgp <- table(esoph[["tobgp"]])
freqtobgp
##
## 0-9g/day 10-19 20-29 30+
## 24 24 20 20
# Share of the 88 rows that falls in each tobacco group.
relfreqtobgp <- prop.table(freqtobgp)
round_relfreqtobgp <- round(relfreqtobgp, 2)
round_relfreqtobgp
##
## 0-9g/day 10-19 20-29 30+
## 0.27 0.27 0.23 0.23
# Counts and rounded shares side by side.
cbind(freqtobgp, round_relfreqtobgp)
## freqtobgp round_relfreqtobgp
## 0-9g/day 24 0.27
## 10-19 24 0.27
## 20-29 20 0.23
## 30+ 20 0.23
# 6. Report the joint frequency of agegp and alcgp, and of alcgp and tobgp,
#    in esoph. Each cell counts the rows that share both factor levels.
x <- xtabs(~ agegp + alcgp, data = esoph)
x
## alcgp
## agegp 0-39g/day 40-79 80-119 120+
## 25-34 4 4 3 4
## 35-44 4 4 4 3
## 45-54 4 4 4 4
## 55-64 4 4 4 4
## 65-74 4 3 4 4
## 75+ 3 4 2 2
y <- xtabs(~ alcgp + tobgp, data = esoph)
y
## tobgp
## alcgp 0-9g/day 10-19 20-29 30+
## 0-39g/day 6 6 5 6
## 40-79 6 6 6 5
## 80-119 6 6 4 5
## 120+ 6 6 5 4
# 7. Load the data set diamonds.
library(ggplot2)
# a. Display the range of the following variables: price, carat, depth and
#    table.
data("diamonds")
diamonds
## # A tibble: 53,940 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4.00 4.05 2.39
## # ... with 53,930 more rows
# Minimum and maximum of each requested variable, one column per variable.
# vapply() pins the result shape: range() must return exactly two numbers.
range_vars <- c("price", "carat", "depth", "table")
diamond_ranges <- vapply(diamonds[range_vars], range, numeric(2))
rownames(diamond_ranges) <- c("min", "max")
diamond_ranges
## price carat depth table
## min 326 0.20 43 43
## max 18823 5.01 79 95
# b. Report the grouped frequency of two of the variables identified in 7.a
#    (price and carat). Each variable keeps its own objects so the second
#    table does not overwrite the first.

# Grouped frequency of price.
price <- diamonds$price
range(price)
## [1] 326 18823
# Eight equal-width bins; the last break (18826) lies above the maximum price.
price_breaks <- seq(326, 18826, by = 2312.5)
price_breaks
## [1] 326.0 2638.5 4951.0 7263.5 9576.0 11888.5 14201.0 16513.5 18826.0
# right = FALSE makes the bins left-closed, [a, b), so the minimum is counted.
price_bins <- cut(price, price_breaks, right = FALSE)
# cut() returns NA for values outside the breaks; fail loudly rather than
# silently dropping observations from the table.
stopifnot(!anyNA(price_bins))
price_freq <- table(price_bins)
price_freq
## price_bins
## [326,2.64e+03) [2.64e+03,4.95e+03) [4.95e+03,7.26e+03)
## 28353 10660 6137
## [7.26e+03,9.58e+03) [9.58e+03,1.19e+04) [1.19e+04,1.42e+04)
## 3148 2107 1474
## [1.42e+04,1.65e+04) [1.65e+04,1.88e+04)
## 1138 923

# Grouped frequency of carat.
carat <- diamonds$carat
range(carat)
## [1] 0.20 5.01
# The last break must exceed the maximum (5.01). Ending the sequence at 5.01
# makes 4.95 the final break, which leaves the largest stone outside every bin.
carat_breaks <- seq(0.15, 5.05, by = 0.10)
carat_breaks
## [1] 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 1.15 1.25 1.35 1.45
## [15] 1.55 1.65 1.75 1.85 1.95 2.05 2.15 2.25 2.35 2.45 2.55 2.65 2.75 2.85
## [29] 2.95 3.05 3.15 3.25 3.35 3.45 3.55 3.65 3.75 3.85 3.95 4.05 4.15 4.25
## [43] 4.35 4.45 4.55 4.65 4.75 4.85 4.95 5.05
carat_bins <- cut(carat, carat_breaks, right = FALSE)
stopifnot(!anyNA(carat_bins))
carat_freq <- table(carat_bins)
carat_freq
## carat_bins
## [0.15,0.25) [0.25,0.35) [0.35,0.45) [0.45,0.55) [0.55,0.65) [0.65,0.75)
## 573 9818 6898 4921 2759 5314
## [0.75,0.85) [0.85,0.95) [0.95,1.05) [1.05,1.15) [1.15,1.25) [1.25,1.35)
## 1725 2656 5897 2899 2613 1173
## [1.35,1.45) [1.45,1.55) [1.55,1.65) [1.65,1.75) [1.75,1.85) [1.85,1.95)
## 387 2447 830 613 187 57
## [1.95,2.05) [2.05,2.15) [2.15,2.25) [2.25,2.35) [2.35,2.45) [2.45,2.55)
## 1176 385 229 146 72 85
## [2.55,2.65) [2.65,2.75) [2.75,2.85) [2.85,2.95) [2.95,3.05) [3.05,3.15)
## 20 15 5 0 26 1
## [3.15,3.25) [3.25,3.35) [3.35,3.45) [3.45,3.55) [3.55,3.65) [3.65,3.75)
## 2 0 1 2 0 2
## [3.75,3.85) [3.85,3.95) [3.95,4.05) [4.05,4.15) [4.15,4.25) [4.25,4.35)
## 0 0 3 1 0 0
## [4.35,4.45) [4.45,4.55) [4.55,4.65) [4.65,4.75) [4.75,4.85) [4.85,4.95)
## 0 1 0 0 0 0
## [4.95,5.05)
## 1
# 7c. Plot a histogram for each of the four variables identified in 7.a.
# hist() builds its default title and x-axis label from the expression passed
# as x, so each call passes the column as diamonds$<name>.
hist(x = diamonds$price, col = "red")

hist(x = diamonds$carat, col = "gold")

hist(x = diamonds$depth, col = "green")

hist(x = diamonds$table, col = "blue")
