library(datasets)
library(plyr)
library(ggplot2)
#1  identify the data type of each variable in mtcars
# Load the mtcars data frame into the workspace and print every row.
data("mtcars")
mtcars
##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
## Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
#2 Report your classification of each variable in mtcars by whether it is discrete or continuous. 
# Compact structure listing: one line per column with its type and first values.
utils::str(object = mtcars)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
#discrete = cyl, vs, am, gear, carb
#continuous = mpg, disp, hp, drat, wt, qsec


#3 Report the distribution of three variables (your selection) from mtcars using the R function summary.
# Min / quartiles / median / mean / max for three chosen mtcars columns.
summary(mtcars[["am"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.4062  1.0000  1.0000
summary(mtcars[["gear"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   3.000   4.000   3.688   4.000   5.000
summary(mtcars[["carb"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   2.000   2.812   4.000   8.000
#4 identify the datatype of three of the following in esoph 
# Load esoph, then show each column's type followed by a per-column summary.
data("esoph")
utils::str(object = esoph)
## 'data.frame':    88 obs. of  5 variables:
##  $ agegp    : Ord.factor w/ 6 levels "25-34"<"35-44"<..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ alcgp    : Ord.factor w/ 4 levels "0-39g/day"<"40-79"<..: 1 1 1 1 2 2 2 2 3 3 ...
##  $ tobgp    : Ord.factor w/ 4 levels "0-9g/day"<"10-19"<..: 1 2 3 4 1 2 3 4 1 2 ...
##  $ ncases   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ncontrols: num  40 10 6 5 27 7 4 7 2 1 ...
summary(esoph)
##    agegp          alcgp         tobgp        ncases         ncontrols    
##  25-34:15   0-39g/day:23   0-9g/day:24   Min.   : 0.000   Min.   : 1.00  
##  35-44:15   40-79    :23   10-19   :24   1st Qu.: 0.000   1st Qu.: 3.00  
##  45-54:16   80-119   :21   20-29   :20   Median : 1.000   Median : 6.00  
##  55-64:16   120+     :21   30+     :20   Mean   : 2.273   Mean   :11.08  
##  65-74:15                                3rd Qu.: 4.000   3rd Qu.:14.00  
##  75+  :11                                Max.   :17.000   Max.   :60.00
#5. Report the frequency and relative frequency distributions of agegp, alcgp, and tobgp in esoph. 

  #frequency and relative frequency for agegp
# Absolute counts per age group. esoph.freq is a second name for the same
# table, so the counts are computed once and shared.
freqagegp <- table(esoph[["agegp"]])
esoph.freq <- freqagegp
freqagegp
## 
## 25-34 35-44 45-54 55-64 65-74   75+ 
##    15    15    16    16    15    11
# Share of all rows per age group, rounded to two decimals for display.
relfreqagegp <- freqagegp / nrow(esoph)
round_relfreqagegp <- round(relfreqagegp, digits = 2)
round_relfreqagegp
## 
## 25-34 35-44 45-54 55-64 65-74   75+ 
##  0.17  0.17  0.18  0.18  0.17  0.12
# Counts and rounded shares side by side, one row per age group.
cbind(freqagegp, round_relfreqagegp)
##       freqagegp round_relfreqagegp
## 25-34        15               0.17
## 35-44        15               0.17
## 45-54        16               0.18
## 55-64        16               0.18
## 65-74        15               0.17
## 75+          11               0.12
#frequency and relative frequency for alcgp

# Absolute counts per alcohol consumption group.
freqalcgp <- table(esoph[["alcgp"]])
freqalcgp
## 
## 0-39g/day     40-79    80-119      120+ 
##        23        23        21        21
# Share of all rows per alcohol group, rounded to two decimals for display.
relfreqalcgp <- freqalcgp / nrow(esoph)
round_relfreqalcgp <- round(relfreqalcgp, digits = 2)
round_relfreqalcgp
## 
## 0-39g/day     40-79    80-119      120+ 
##      0.26      0.26      0.24      0.24
# Counts and rounded shares side by side, one row per alcohol group.
cbind(freqalcgp, round_relfreqalcgp)
##           freqalcgp round_relfreqalcgp
## 0-39g/day        23               0.26
## 40-79            23               0.26
## 80-119           21               0.24
## 120+             21               0.24
#frequency and relative frequency for tobgp


# Absolute counts per tobacco consumption group.
freqtobgp <- table(esoph[["tobgp"]])
freqtobgp
## 
## 0-9g/day    10-19    20-29      30+ 
##       24       24       20       20
# Share of all rows per tobacco group, rounded to two decimals for display.
relfreqtobgp <- freqtobgp / nrow(esoph)
round_relfreqtobgp <- round(relfreqtobgp, digits = 2)
round_relfreqtobgp
## 
## 0-9g/day    10-19    20-29      30+ 
##     0.27     0.27     0.23     0.23
# Counts and rounded shares side by side, one row per tobacco group.
cbind(freqtobgp, round_relfreqtobgp)
##          freqtobgp round_relfreqtobgp
## 0-9g/day        24               0.27
## 10-19           24               0.27
## 20-29           20               0.23
## 30+             20               0.23
#6 Report the joint frequency of agegp and alcgp as well as alcgp and tobgp in esoph. 

# Joint frequency table: age group (rows) by alcohol group (columns).
x <- xtabs(~ agegp + alcgp, data = esoph)
x
##        alcgp
## agegp   0-39g/day 40-79 80-119 120+
##   25-34         4     4      3    4
##   35-44         4     4      4    3
##   45-54         4     4      4    4
##   55-64         4     4      4    4
##   65-74         4     3      4    4
##   75+           3     4      2    2
# Joint frequency table: alcohol group (rows) by tobacco group (columns).
y <- xtabs(~ alcgp + tobgp, data = esoph)
y
##            tobgp
## alcgp       0-9g/day 10-19 20-29 30+
##   0-39g/day        6     6     5   6
##   40-79            6     6     6   5
##   80-119           6     6     4   5
##   120+             6     6     5   4
# 7 load the dataset, diamonds,

library(ggplot2)

#a display the range of the following variables: price, carat, depth and table
data("diamonds")
# Printing the tibble alone does not report any range, so compute min/max
# explicitly. The result has one column per variable; row 1 is the minimum and
# row 2 the maximum.
vapply(diamonds[c("price", "carat", "depth", "table")], range, numeric(2))
diamonds
## # A tibble: 53,940 x 10
##    carat       cut color clarity depth table price     x     y     z
##    <dbl>     <ord> <ord>   <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  0.23     Ideal     E     SI2  61.5    55   326  3.95  3.98  2.43
##  2  0.21   Premium     E     SI1  59.8    61   326  3.89  3.84  2.31
##  3  0.23      Good     E     VS1  56.9    65   327  4.05  4.07  2.31
##  4  0.29   Premium     I     VS2  62.4    58   334  4.20  4.23  2.63
##  5  0.31      Good     J     SI2  63.3    58   335  4.34  4.35  2.75
##  6  0.24 Very Good     J    VVS2  62.8    57   336  3.94  3.96  2.48
##  7  0.24 Very Good     I    VVS1  62.3    57   336  3.95  3.98  2.47
##  8  0.26 Very Good     H     SI1  61.9    55   337  4.07  4.11  2.53
##  9  0.22      Fair     E     VS2  65.1    61   337  3.87  3.78  2.49
## 10  0.23 Very Good     H     VS1  59.4    61   338  4.00  4.05  2.39
## # ... with 53,930 more rows
#b report the group frequency of any two of the variables identified in 7.a
   # Grouped frequency of price: eight equal-width bins, each closed on the
   # left and open on the right, spanning 326 up to 18826.
   duration <- diamonds[["price"]]
   range(duration)
## [1]   326 18823
   breaks <- seq(from = 326, to = 18826, by = 2312.5)
   breaks
## [1]   326.0  2638.5  4951.0  7263.5  9576.0 11888.5 14201.0 16513.5 18826.0
   duration.cut <- cut(duration, breaks = breaks, right = FALSE)
   duration.freq <- table(duration.cut)
   duration.freq
## duration.cut
##      [326,2.64e+03) [2.64e+03,4.95e+03) [4.95e+03,7.26e+03) 
##               28353               10660                6137 
## [7.26e+03,9.58e+03) [9.58e+03,1.19e+04) [1.19e+04,1.42e+04) 
##                3148                2107                1474 
## [1.42e+04,1.65e+04) [1.65e+04,1.88e+04) 
##                1138                 923
      # Grouped frequency of carat: 0.10-wide bins, each closed on the left and
      # open on the right, starting at 0.15.
      # NOTE: `duration` is a carried-over name; here it holds carat values.
      duration <- diamonds$carat
      range(duration)
## [1] 0.20 5.01
      # The upper limit of the sequence must lie above the largest carat.
      # Stopping at 5.01 made 4.95 the last break, so cut() returned NA for the
      # 5.01 stone and table() silently dropped it (counts summed to 53,939 of
      # 53,940 rows). Going one bin width past the maximum adds [4.95,5.05).
      breaks <- seq(0.15, max(duration) + 0.10, by = 0.10)
      breaks
##  [1] 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 1.15 1.25 1.35 1.45
## [15] 1.55 1.65 1.75 1.85 1.95 2.05 2.15 2.25 2.35 2.45 2.55 2.65 2.75 2.85
## [29] 2.95 3.05 3.15 3.25 3.35 3.45 3.55 3.65 3.75 3.85 3.95 4.05 4.15 4.25
## [43] 4.35 4.45 4.55 4.65 4.75 4.85 4.95 5.05
      duration.cut <- cut(duration, breaks, right = FALSE)
      # Guard: every observation must land in a bin, otherwise counts are short.
      stopifnot(!anyNA(duration.cut))
      duration.freq <- table(duration.cut)
      duration.freq
## duration.cut
## [0.15,0.25) [0.25,0.35) [0.35,0.45) [0.45,0.55) [0.55,0.65) [0.65,0.75) 
##         573        9818        6898        4921        2759        5314 
## [0.75,0.85) [0.85,0.95) [0.95,1.05) [1.05,1.15) [1.15,1.25) [1.25,1.35) 
##        1725        2656        5897        2899        2613        1173 
## [1.35,1.45) [1.45,1.55) [1.55,1.65) [1.65,1.75) [1.75,1.85) [1.85,1.95) 
##         387        2447         830         613         187          57 
## [1.95,2.05) [2.05,2.15) [2.15,2.25) [2.25,2.35) [2.35,2.45) [2.45,2.55) 
##        1176         385         229         146          72          85 
## [2.55,2.65) [2.65,2.75) [2.75,2.85) [2.85,2.95) [2.95,3.05) [3.05,3.15) 
##          20          15           5           0          26           1 
## [3.15,3.25) [3.25,3.35) [3.35,3.45) [3.45,3.55) [3.55,3.65) [3.65,3.75) 
##           2           0           1           2           0           2 
## [3.75,3.85) [3.85,3.95) [3.95,4.05) [4.05,4.15) [4.15,4.25) [4.25,4.35) 
##           0           0           3           1           0           0 
## [4.35,4.45) [4.45,4.55) [4.55,4.65) [4.65,4.75) [4.75,4.85) [4.85,4.95) 
##           0           1           0           0           0           0 
## [4.95,5.05) 
##           1
    #7c plot a histogram for each of the four variables identified in 7.a
      
      
      # One histogram per variable from 7.a, each with its own fill colour.
      # Title and x-axis label are spelled out so they read "diamonds$<name>",
      # which is what hist() derives when called on diamonds$<name> directly.
      local({
        fills <- c(price = "red", carat = "gold", depth = "green", table = "blue")
        for (column in names(fills)) {
          axis_label <- paste0("diamonds$", column)
          hist(
            diamonds[[column]],
            col = fills[[column]],
            main = paste("Histogram of", axis_label),
            xlab = axis_label
          )
        }
      })