# Show which objects currently exist in the workspace.
ls()
## character(0)
# Remove every object from the workspace, then run the garbage collector;
# gc() also prints the memory-usage table that follows.
rm(list = ls())
gc()
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 291320  7.8     592000 15.9   391619 10.5
## Vcells 333507  2.6     786432  6.0   692009  5.3
# memory.size()   # Windows-specific
# memory.limit()  # Windows-specific
# install.packages("ggplot2")  # run once if ggplot2 is not installed yet
library(ggplot2)
# Load the diamonds data set from ggplot2 into the workspace.
data("diamonds", package = "ggplot2")
# Column names.
names(diamonds)
##  [1] "carat"   "cut"     "color"   "clarity" "depth"   "table"   "price"  
##  [8] "x"       "y"       "z"
# Class of the object.
class(diamonds)
## [1] "data.frame"
# Size: rows and columns together, then each on its own.
dim(diamonds)
## [1] 53940    10
nrow(diamonds)
## [1] 53940
ncol(diamonds)
## [1] 10
# Compact overview of every column: type plus the first few values.
str(diamonds)
## 'data.frame':    53940 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Data inspection ----
# First six rows of the whole data frame.
head(diamonds)
##   carat       cut color clarity depth table price    x    y    z
## 1  0.23     Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43
## 2  0.21   Premium     E     SI1  59.8    61   326 3.89 3.84 2.31
## 3  0.23      Good     E     VS1  56.9    65   327 4.05 4.07 2.31
## 4  0.29   Premium     I     VS2  62.4    58   334 4.20 4.23 2.63
## 5  0.31      Good     J     SI2  63.3    58   335 4.34 4.35 2.75
## 6  0.24 Very Good     J    VVS2  62.8    57   336 3.94 3.96 2.48
# First six values of one column, extracted as a vector with `$`.
head(diamonds$carat)
## [1] 0.23 0.21 0.23 0.29 0.31 0.24
# Row 3, all columns.
diamonds[3, ]
##   carat  cut color clarity depth table price    x    y    z
## 3  0.23 Good     E     VS1  56.9    65   327 4.05 4.07 2.31
# First ten values of column 3 (color) as a vector. drop = TRUE is spelled
# out so the result is a vector even if diamonds is a tibble, where
# `[, j]` would otherwise return a one-column table
# (NOTE(review): newer ggplot2 releases appear to ship diamonds as a tibble;
# confirm for the installed version).
head(diamonds[, 3, drop = TRUE], 10)
##  [1] E E E I J J I H E H
## Levels: D < E < F < G < H < I < J
# Last six rows.
tail(diamonds)
##       carat       cut color clarity depth table price    x    y    z
## 53935  0.72   Premium     D     SI1  62.7    59  2757 5.69 5.73 3.58
## 53936  0.72     Ideal     D     SI1  60.8    57  2757 5.75 5.76 3.50
## 53937  0.72      Good     D     SI1  63.1    55  2757 5.69 5.75 3.61
## 53938  0.70 Very Good     D     SI1  62.8    60  2757 5.66 5.68 3.56
## 53939  0.86   Premium     H     SI2  61.0    58  2757 6.15 6.12 3.74
## 53940  0.75     Ideal     D     SI2  62.2    55  2757 5.83 5.87 3.64
# Missing value treatment ----
# Drop every row that contains at least one NA and preview what is left.
head(na.omit(diamonds))
##   carat       cut color clarity depth table price    x    y    z
## 1  0.23     Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43
## 2  0.21   Premium     E     SI1  59.8    61   326 3.89 3.84 2.31
## 3  0.23      Good     E     VS1  56.9    65   327 4.05 4.07 2.31
## 4  0.29   Premium     I     VS2  62.4    58   334 4.20 4.23 2.63
## 5  0.31      Good     J     SI2  63.3    58   335 4.34 4.35 2.75
## 6  0.24 Very Good     J    VVS2  62.8    57   336 3.94 3.96 2.48
# Mean price, ignoring NAs. The result is a single number, so no head() is
# needed; TRUE is used instead of T because T is an ordinary variable that
# can be reassigned.
mean(diamonds$price, na.rm = TRUE)
## [1] 3932.8
# Element-wise NA flags for price (first six shown).
head(is.na(diamonds$price))
## [1] FALSE FALSE FALSE FALSE FALSE FALSE
# Count how many prices are missing (TRUE) versus present (FALSE).
naa <- is.na(diamonds$price)
table(naa)
## naa
## FALSE 
## 53940
# Random sampling ----
# No seed is set, so every run draws different values from those recorded in
# the `##` output lines.
# Three draws from 1:10 with replacement.
sample(10, 3, replace = TRUE)
## [1] 10  2  3
# Five draws from 1:10 without replacement.
sample(10, 5, replace = FALSE)
## [1]  4  6 10  9  7
# Ten normal deviates with mean 5 and standard deviation 9.
rnorm(10, mean = 5, sd = 9)
##  [1] -5.323364 10.778377  9.562595 25.202856 14.740653  6.146948  4.712989
##  [8] 10.033625 13.576019 -2.904901
# 54 distinct row numbers out of 53940.
sample(53940, 54, replace = FALSE)
##  [1] 16015  3295 30810  6128 13020 17596 26258 13724 18290 49344 28803
## [12] 19296 30969 18564 47637 19556 36676 31809 27470 40427 13827 19285
## [23] 34027 19775  3914 42903 38505 22783 22571 24783 53796 49667 35219
## [34] 43201 41791 47455 22991   900 32144 27631 28891 40676 12270  9432
## [45] 25694  4886 20734  8846 28651  6460 33818  7642 21005 15168
# A 0.1% sample of row numbers. 0.001 * 53940 is 53.94; floor() makes the
# rounding down to 53 explicit instead of relying on sample() truncating a
# fractional size.
sample(nrow(diamonds), floor(0.001 * nrow(diamonds)), replace = FALSE)
##  [1] 40131 24733 29152  4190  4079 31014 23678 20268 36029 37565 26792
## [12] 52564 31527 22044  4255 39092 40824 35166  4566 20044  8307 21015
## [23] 42075 36046 41057 20671 28080  5624 31169 49728 48181 17372 26373
## [34] 11403 37404  8279 25680 15130 23026 43130 43979 10054 43876 16751
## [45]  8193 25554 42141  3124 29700 45469 53186 25642 33776
# Same idea with the row count stored once; a 0.01% sample is 5 rows.
a <- nrow(diamonds)
sample(a, floor(0.0001 * a), replace = FALSE)
## [1] 40676 32758 43476 47130 38664
# This is a fresh draw, independent of the one printed just above; these are
# the row numbers actually used to pull rows out of the data frame.
randomrows <- sample(a, floor(0.0001 * a), replace = FALSE)
diamonds[randomrows, ]
##       carat       cut color clarity depth table price    x    y    z
## 29067  0.40      Good     F     SI1  63.1    58   687 4.66 4.69 2.95
## 28304  0.32 Very Good     I     SI1  62.8    58   432 4.34 4.39 2.74
## 25301  1.58 Very Good     G     VS1  62.8    57 13963 7.34 7.40 4.63
## 15670  1.00      Fair     E     VS2  57.3    64  6285 6.59 6.46 3.79
## 5267   1.03      Good     J     VS2  63.7    56  3795 6.42 6.35 4.07
# Descriptive statistics ----
# Per-column summary: quantiles and mean for numeric columns, level counts
# for factors.
summary(diamonds)
##      carat               cut        color        clarity     
##  Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065  
##  1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258  
##  Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194  
##  Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171  
##  3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066  
##  Max.   :5.0100                     I: 5422   VVS1   : 3655  
##                                     J: 2808   (Other): 2531  
##      depth           table           price             x         
##  Min.   :43.00   Min.   :43.00   Min.   :  326   Min.   : 0.000  
##  1st Qu.:61.00   1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710  
##  Median :61.80   Median :57.00   Median : 2401   Median : 5.700  
##  Mean   :61.75   Mean   :57.46   Mean   : 3933   Mean   : 5.731  
##  3rd Qu.:62.50   3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540  
##  Max.   :79.00   Max.   :95.00   Max.   :18823   Max.   :10.740  
##                                                                  
##        y                z         
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 4.720   1st Qu.: 2.910  
##  Median : 5.710   Median : 3.530  
##  Mean   : 5.735   Mean   : 3.539  
##  3rd Qu.: 6.540   3rd Qu.: 4.040  
##  Max.   :58.900   Max.   :31.800  
## 
# One-way frequency table: number of diamonds per cut.
table(diamonds[["cut"]])
## 
##      Fair      Good Very Good   Premium     Ideal 
##      1610      4906     12082     13791     21551
# Two-way table: cut (rows) by color (columns).
table(diamonds[["cut"]], diamonds[["color"]])
##            
##                D    E    F    G    H    I    J
##   Fair       163  224  312  314  303  175  119
##   Good       662  933  909  871  702  522  307
##   Very Good 1513 2400 2164 2299 1824 1204  678
##   Premium   1603 2337 2331 2924 2360 1428  808
##   Ideal     2834 3903 3826 4884 3115 2093  896
# Three-way table: one cut-by-color slice per clarity level.
table(diamonds[["cut"]], diamonds[["color"]], diamonds[["clarity"]])
## , ,  = I1
## 
##            
##                D    E    F    G    H    I    J
##   Fair         4    9   35   53   52   34   23
##   Good         8   23   19   19   14    9    4
##   Very Good    5   22   13   16   12    8    8
##   Premium     12   30   34   46   46   24   13
##   Ideal       13   18   42   16   38   17    2
## 
## , ,  = SI2
## 
##            
##                D    E    F    G    H    I    J
##   Fair        56   78   89   80   91   45   27
##   Good       223  202  201  163  158   81   53
##   Very Good  314  445  343  327  343  200  128
##   Premium    421  519  523  492  521  312  161
##   Ideal      356  469  453  486  450  274  110
## 
## , ,  = SI1
## 
##            
##                D    E    F    G    H    I    J
##   Fair        58   65   83   69   75   30   28
##   Good       237  355  273  207  235  165   88
##   Very Good  494  626  559  474  547  358  182
##   Premium    556  614  608  566  655  367  209
##   Ideal      738  766  608  660  763  504  243
## 
## , ,  = VS2
## 
##            
##                D    E    F    G    H    I    J
##   Fair        25   42   53   45   41   32   23
##   Good       104  160  184  192  138  110   90
##   Very Good  309  503  466  479  376  274  184
##   Premium    339  629  619  721  532  315  202
##   Ideal      920 1136  879  910  556  438  232
## 
## , ,  = VS1
## 
##            
##                D    E    F    G    H    I    J
##   Fair         5   14   33   45   32   25   16
##   Good        43   89  132  152   77  103   52
##   Very Good  175  293  293  432  257  205  120
##   Premium    131  292  290  566  336  221  153
##   Ideal      351  593  616  953  467  408  201
## 
## , ,  = VVS2
## 
##            
##                D    E    F    G    H    I    J
##   Fair         9   13   10   17   11    8    1
##   Good        25   52   50   75   45   26   13
##   Very Good  141  298  249  302  145   71   29
##   Premium     94  121  146  275  118   82   34
##   Ideal      284  507  520  774  289  178   54
## 
## , ,  = VVS1
## 
##            
##                D    E    F    G    H    I    J
##   Fair         3    3    5    3    1    1    1
##   Good        13   43   35   41   31   22    1
##   Very Good   52  170  174  190  115   69   19
##   Premium     40  105   80  171  112   84   24
##   Ideal      144  335  440  594  326  179   29
## 
## , ,  = IF
## 
##            
##                D    E    F    G    H    I    J
##   Fair         3    0    4    2    0    0    0
##   Good         9    9   15   22    4    6    6
##   Very Good   23   43   67   79   29   19    8
##   Premium     10   27   31   87   40   23   12
##   Ideal       28   79  268  491  226   95   25
# Overall mean price.
mean(diamonds$price)
## [1] 3932.8
# Using Hmisc ----
library(Hmisc)
## Loading required package: grid
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
# Detailed univariate description of price: counts, quantiles, extremes.
describe(diamonds$price)
## diamonds$price 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##   53940       0   11602       1    3933     544     646     950    2401 
##     .75     .90     .95 
##    5324    9821   13107 
## 
## lowest :   326   327   334   335   336
## highest: 18803 18804 18806 18818 18823
# Grouped statistics. summarize is called as Hmisc::summarize because dplyr,
# attached later in this script, masks the name; the qualified call keeps
# these lines working when re-run in a session where dplyr is already loaded.
# Mean price per color.
Hmisc::summarize(diamonds$price, diamonds$color, mean)
##   diamonds$color diamonds$price
## 1              D       3169.954
## 2              E       3076.752
## 3              F       3724.886
## 4              G       3999.136
## 5              H       4486.669
## 6              I       5091.875
## 7              J       5323.818
# Maximum price per color.
Hmisc::summarize(diamonds$price, diamonds$color, max)
##   diamonds$color diamonds$price
## 1              D          18693
## 2              E          18731
## 3              F          18791
## 4              G          18818
## 5              H          18803
## 6              I          18823
## 7              J          18710
# Mean price per color/cut combination; llist() bundles the two grouping
# variables.
Hmisc::summarize(diamonds$price, llist(diamonds$color, diamonds$cut), mean)
##    diamonds$color diamonds$cut diamonds$price
## 1               D         Fair       4291.061
## 2               D         Good       3405.382
## 5               D    Very Good       3470.467
## 4               D      Premium       3631.293
## 3               D        Ideal       2629.095
## 6               E         Fair       3682.312
## 7               E         Good       3423.644
## 10              E    Very Good       3214.652
## 9               E      Premium       3538.914
## 8               E        Ideal       2597.550
## 11              F         Fair       3827.003
## 12              F         Good       3495.750
## 15              F    Very Good       3778.820
## 14              F      Premium       4324.890
## 13              F        Ideal       3374.939
## 16              G         Fair       4239.255
## 17              G         Good       4123.482
## 20              G    Very Good       3872.754
## 19              G      Premium       4500.742
## 18              G        Ideal       3720.706
## 21              H         Fair       5135.683
## 22              H         Good       4276.255
## 25              H    Very Good       4535.390
## 24              H      Premium       5216.707
## 23              H        Ideal       3889.335
## 26              I         Fair       4685.446
## 27              I         Good       5078.533
## 30              I    Very Good       5255.880
## 29              I      Premium       5946.181
## 28              I        Ideal       4451.970
## 31              J         Fair       4975.655
## 32              J         Good       4574.173
## 35              J    Very Good       5103.513
## 34              J      Premium       6294.592
## 33              J        Ideal       4918.186
# Reshape ----
# Four ways to build the same cut-by-color table of mean prices.
library(reshape2)
# 1. reshape2: cast to a matrix, aggregating price with mean.
acast(diamonds, cut ~ color, fun.aggregate = mean, value.var = "price")
##                  D        E        F        G        H        I        J
## Fair      4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
## Good      3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
## Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
## Premium   3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
## Ideal     2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186
# 2. Base R: tapply over the two grouping factors.
tapply(diamonds$price, list(diamonds$cut, diamonds$color), mean)
##                  D        E        F        G        H        I        J
## Fair      4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
## Good      3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
## Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
## Premium   3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
## Ideal     2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186
# 3. Base R: per-cell price totals divided by per-cell counts.
xtabs(price ~ cut + color, data = diamonds) / table(diamonds[c("cut", "color")])
##            color
## cut                D        E        F        G        H        I        J
##   Fair      4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
##   Good      3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
##   Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
##   Premium   3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
##   Ideal     2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186
library(data.table)
# 4. dcast on a data.table copy of the data.
dcast(
  as.data.table(diamonds),
  cut ~ color,
  fun.aggregate = mean,
  value.var = "price"
)
##         cut        D        E        F        G        H        I        J
## 1      Fair 4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
## 2      Good 3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
## 3 Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
## 4   Premium 3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
## 5     Ideal 2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:data.table':
## 
##     between, last
## 
## The following objects are masked from 'package:Hmisc':
## 
##     combine, src, summarize
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
# Mean price per (cut, color), spread wide: one row per cut, one column per
# color. ungroup() drops any grouping left over from group_by() so that b is
# a plain, ungrouped table.
b <- diamonds %>%
  group_by(cut, color) %>%
  summarise(price = mean(price)) %>%
  ungroup() %>%
  spread(color, price)
b
## Source: local data frame [5 x 8]
## 
##         cut        D        E        F        G        H        I        J
## 1      Fair 4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
## 2      Good 3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
## 3 Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
## 4   Premium 3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
## 5     Ideal 2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186
str(b)
## Classes 'tbl_df', 'tbl' and 'data.frame':    5 obs. of  8 variables:
##  $ cut: Ord.factor w/ 5 levels "Fair"<"Good"<..: 1 2 3 4 5
##  $ D  : num  4291 3405 3470 3631 2629
##  $ E  : num  3682 3424 3215 3539 2598
##  $ F  : num  3827 3496 3779 4325 3375
##  $ G  : num  4239 4123 3873 4501 3721
##  $ H  : num  5136 4276 4535 5217 3889
##  $ I  : num  4685 5079 5256 5946 4452
##  $ J  : num  4976 4574 5104 6295 4918
# Heat map of the mean-price matrix. b has eight columns (cut, then D to J),
# so dropping only the first one keeps all seven color columns, J included.
image(as.matrix(b[-1]))

# Subsetting ----
# Rows whose cut is "Ideal".
cut2 <- diamonds[diamonds[["cut"]] == "Ideal", ]
head(cut2)
##    carat   cut color clarity depth table price    x    y    z
## 1   0.23 Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43
## 12  0.23 Ideal     J     VS1  62.8    56   340 3.93 3.90 2.46
## 14  0.31 Ideal     J     SI2  62.2    54   344 4.35 4.37 2.71
## 17  0.30 Ideal     I     SI2  62.0    54   348 4.31 4.34 2.68
## 40  0.33 Ideal     I     SI2  61.8    55   403 4.49 4.51 2.78
## 41  0.33 Ideal     I     SI2  61.2    56   403 4.49 4.50 2.75
# Rows that are both "Ideal" cut AND color "D" (element-wise &).
cut3 <- diamonds[diamonds[["cut"]] == "Ideal" & diamonds[["color"]] == "D", ]
head(cut3)
##     carat   cut color clarity depth table price    x    y    z
## 63   0.30 Ideal     D     SI1  62.5    57   552 4.29 4.32 2.69
## 64   0.30 Ideal     D     SI1  62.1    56   552 4.30 4.33 2.68
## 121  0.71 Ideal     D     SI2  62.3    56  2762 5.73 5.69 3.56
## 133  0.71 Ideal     D     SI1  61.9    59  2764 5.69 5.72 3.53
## 145  0.71 Ideal     D     SI2  61.6    55  2767 5.74 5.76 3.54
## 156  0.76 Ideal     D     SI2  62.4    57  2770 5.78 5.83 3.62
# Rows that are "Ideal" cut OR color "D" (element-wise |).
cut4 <- diamonds[diamonds[["cut"]] == "Ideal" | diamonds[["color"]] == "D", ]
head(cut4)
##    carat       cut color clarity depth table price    x    y    z
## 1   0.23     Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43
## 12  0.23     Ideal     J     VS1  62.8    56   340 3.93 3.90 2.46
## 14  0.31     Ideal     J     SI2  62.2    54   344 4.35 4.37 2.71
## 17  0.30     Ideal     I     SI2  62.0    54   348 4.31 4.34 2.68
## 29  0.23 Very Good     D     VS2  60.5    61   357 3.96 3.97 2.40
## 35  0.23 Very Good     D     VS1  61.9    58   402 3.92 3.96 2.44
# Label each diamond by price band (threshold 9000) and count each label.
cut5 <- ifelse(diamonds[["price"]] > 9000, "Expensive", "Not So Expensive")
table(cut5)
## cut5
##        Expensive Not So Expensive 
##             6298            47642