# Show which objects currently exist in the workspace.
ls()
## character(0)
# Start from a clean slate: drop every visible object, then run the garbage
# collector, which also prints a summary of memory in use.
# NOTE(review): rm(list = ls()) also deletes whatever the caller had loaded;
# prefer restarting the R session when this script is shared.
rm(list = ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 291320 7.8 592000 15.9 391619 10.5
## Vcells 333507 2.6 786432 6.0 692009 5.3
# memory.size()   # Windows-specific
# memory.limit()  # Windows-specific
# install.packages("ggplot2")
library(ggplot2)
# Load the diamonds example data set from ggplot2.
data("diamonds", package = "ggplot2")
# Column names.
names(diamonds)
## [1] "carat" "cut" "color" "clarity" "depth" "table" "price"
## [8] "x" "y" "z"
# Class of the object.
class(diamonds)
## [1] "data.frame"
# Dimensions: rows and columns together, then each on its own.
dim(diamonds)
## [1] 53940 10
nrow(diamonds)
## [1] 53940
ncol(diamonds)
## [1] 10
# Compact per-column overview: type and the first few values.
str(diamonds)
## 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Data inspection ----
# First six rows of the full table.
head(diamonds)
## carat cut color clarity depth table price x y z
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
# First six values of a single column.
head(diamonds[["carat"]])
## [1] 0.23 0.21 0.23 0.29 0.31 0.24
# Row 3, all columns.
diamonds[3, ]
## carat cut color clarity depth table price x y z
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
# First ten values of the third column (color).
head(diamonds[, 3], n = 10)
## [1] E E E I J J I H E H
## Levels: D < E < F < G < H < I < J
# Last six rows.
tail(diamonds)
## carat cut color clarity depth table price x y z
## 53935 0.72 Premium D SI1 62.7 59 2757 5.69 5.73 3.58
## 53936 0.72 Ideal D SI1 60.8 57 2757 5.75 5.76 3.50
## 53937 0.72 Good D SI1 63.1 55 2757 5.69 5.75 3.61
## 53938 0.70 Very Good D SI1 62.8 60 2757 5.66 5.68 3.56
## 53939 0.86 Premium H SI2 61.0 58 2757 6.15 6.12 3.74
## 53940 0.75 Ideal D SI2 62.2 55 2757 5.83 5.87 3.64
# Missing-value handling ----
# Drop every row that contains an NA, then preview the result.
head(na.omit(diamonds))
## carat cut color clarity depth table price x y z
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
# Mean price with NAs ignored (a single number, so no head() is needed).
mean(diamonds$price, na.rm = TRUE)
## [1] 3932.8
# Element-wise NA flags for price; only the first six are shown.
head(is.na(diamonds$price))
## [1] FALSE FALSE FALSE FALSE FALSE FALSE
# Tally missing versus present prices.
naa <- is.na(diamonds$price)
table(naa)
## naa
## FALSE
## 53940
# Random sampling ----
# Three draws from 1:10 with replacement.
sample(10, size = 3, replace = TRUE)
## [1] 10 2 3
# Five draws from 1:10 without replacement.
sample(10, size = 5, replace = FALSE)
## [1] 4 6 10 9 7
# Ten normal deviates with mean 5 and standard deviation 9.
rnorm(10, mean = 5, sd = 9)
## [1] -5.323364 10.778377 9.562595 25.202856 14.740653 6.146948 4.712989
## [8] 10.033625 13.576019 -2.904901
# 54 distinct numbers out of 1:53940.
sample(53940, size = 54, replace = FALSE)
## [1] 16015 3295 30810 6128 13020 17596 26258 13724 18290 49344 28803
## [12] 19296 30969 18564 47637 19556 36676 31809 27470 40427 13827 19285
## [23] 34027 19775 3914 42903 38505 22783 22571 24783 53796 49667 35219
## [34] 43201 41791 47455 22991 900 32144 27631 28891 40676 12270 9432
## [45] 25694 4886 20734 8846 28651 6460 33818 7642 21005 15168
# Draw 0.1% of the row numbers without replacement. The requested size is
# fractional (53.94); the output below has 53 entries, i.e. it is truncated.
sample(nrow(diamonds), size = 0.001 * nrow(diamonds), replace = FALSE)
## [1] 40131 24733 29152 4190 4079 31014 23678 20268 36029 37565 26792
## [12] 52564 31527 22044 4255 39092 40824 35166 4566 20044 8307 21015
## [23] 42075 36046 41057 20671 28080 5624 31169 49728 48181 17372 26373
## [34] 11403 37404 8279 25680 15130 23026 43130 43979 10054 43876 16751
## [45] 8193 25554 42141 3124 29700 45469 53186 25642 33776
# Same idea at 0.01%, with the row count cached in `a`.
a <- nrow(diamonds)
sample(a, size = 0.0001 * a, replace = FALSE)
## [1] 40676 32758 43476 47130 38664
# Draw again, keep the row numbers, and pull those rows out of the data.
randomrows <- sample(a, size = 0.0001 * a, replace = FALSE)
diamonds[randomrows, ]
## carat cut color clarity depth table price x y z
## 29067 0.40 Good F SI1 63.1 58 687 4.66 4.69 2.95
## 28304 0.32 Very Good I SI1 62.8 58 432 4.34 4.39 2.74
## 25301 1.58 Very Good G VS1 62.8 57 13963 7.34 7.40 4.63
## 15670 1.00 Fair E VS2 57.3 64 6285 6.59 6.46 3.79
## 5267 1.03 Good J VS2 63.7 56 3795 6.42 6.35 4.07
# Descriptive statistics
# Per-column summary: quantiles and mean for the numeric columns, level
# counts for the factor columns (see the output below).
summary(diamonds)
## carat cut color clarity
## Min. :0.2000 Fair : 1610 D: 6775 SI1 :13065
## 1st Qu.:0.4000 Good : 4906 E: 9797 VS2 :12258
## Median :0.7000 Very Good:12082 F: 9542 SI2 : 9194
## Mean :0.7979 Premium :13791 G:11292 VS1 : 8171
## 3rd Qu.:1.0400 Ideal :21551 H: 8304 VVS2 : 5066
## Max. :5.0100 I: 5422 VVS1 : 3655
## J: 2808 (Other): 2531
## depth table price x
## Min. :43.00 Min. :43.00 Min. : 326 Min. : 0.000
## 1st Qu.:61.00 1st Qu.:56.00 1st Qu.: 950 1st Qu.: 4.710
## Median :61.80 Median :57.00 Median : 2401 Median : 5.700
## Mean :61.75 Mean :57.46 Mean : 3933 Mean : 5.731
## 3rd Qu.:62.50 3rd Qu.:59.00 3rd Qu.: 5324 3rd Qu.: 6.540
## Max. :79.00 Max. :95.00 Max. :18823 Max. :10.740
##
## y z
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 4.720 1st Qu.: 2.910
## Median : 5.710 Median : 3.530
## Mean : 5.735 Mean : 3.539
## 3rd Qu.: 6.540 3rd Qu.: 4.040
## Max. :58.900 Max. :31.800
##
# Frequency of each cut grade.
table(diamonds$cut)
##
## Fair Good Very Good Premium Ideal
## 1610 4906 12082 13791 21551
# Two-way counts: cut (rows) by color (columns).
table(diamonds$cut, diamonds$color)
##
## D E F G H I J
## Fair 163 224 312 314 303 175 119
## Good 662 933 909 871 702 522 307
## Very Good 1513 2400 2164 2299 1824 1204 678
## Premium 1603 2337 2331 2924 2360 1428 808
## Ideal 2834 3903 3826 4884 3115 2093 896
# Three-way counts: one cut-by-color table per clarity level.
table(diamonds$cut, diamonds$color, diamonds$clarity)
## , , = I1
##
##
## D E F G H I J
## Fair 4 9 35 53 52 34 23
## Good 8 23 19 19 14 9 4
## Very Good 5 22 13 16 12 8 8
## Premium 12 30 34 46 46 24 13
## Ideal 13 18 42 16 38 17 2
##
## , , = SI2
##
##
## D E F G H I J
## Fair 56 78 89 80 91 45 27
## Good 223 202 201 163 158 81 53
## Very Good 314 445 343 327 343 200 128
## Premium 421 519 523 492 521 312 161
## Ideal 356 469 453 486 450 274 110
##
## , , = SI1
##
##
## D E F G H I J
## Fair 58 65 83 69 75 30 28
## Good 237 355 273 207 235 165 88
## Very Good 494 626 559 474 547 358 182
## Premium 556 614 608 566 655 367 209
## Ideal 738 766 608 660 763 504 243
##
## , , = VS2
##
##
## D E F G H I J
## Fair 25 42 53 45 41 32 23
## Good 104 160 184 192 138 110 90
## Very Good 309 503 466 479 376 274 184
## Premium 339 629 619 721 532 315 202
## Ideal 920 1136 879 910 556 438 232
##
## , , = VS1
##
##
## D E F G H I J
## Fair 5 14 33 45 32 25 16
## Good 43 89 132 152 77 103 52
## Very Good 175 293 293 432 257 205 120
## Premium 131 292 290 566 336 221 153
## Ideal 351 593 616 953 467 408 201
##
## , , = VVS2
##
##
## D E F G H I J
## Fair 9 13 10 17 11 8 1
## Good 25 52 50 75 45 26 13
## Very Good 141 298 249 302 145 71 29
## Premium 94 121 146 275 118 82 34
## Ideal 284 507 520 774 289 178 54
##
## , , = VVS1
##
##
## D E F G H I J
## Fair 3 3 5 3 1 1 1
## Good 13 43 35 41 31 22 1
## Very Good 52 170 174 190 115 69 19
## Premium 40 105 80 171 112 84 24
## Ideal 144 335 440 594 326 179 29
##
## , , = IF
##
##
## D E F G H I J
## Fair 3 0 4 2 0 0 0
## Good 9 9 15 22 4 6 6
## Very Good 23 43 67 79 29 19 8
## Premium 10 27 31 87 40 23 12
## Ideal 28 79 268 491 226 95 25
# Overall mean price across all diamonds.
mean(diamonds$price)
## [1] 3932.8
#using Hmisc
library(Hmisc)
## Loading required package: grid
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
# Detailed univariate summary of price (n, missing, quantiles, extremes).
# The Hmisc functions are namespace-qualified because dplyr, attached further
# down in this script, exports its own summarize() and masks the Hmisc one;
# a bare summarize() would then resolve to the wrong function whenever this
# section is re-run in the same session.
Hmisc::describe(diamonds$price)
## diamonds$price
## n missing unique Info Mean .05 .10 .25 .50
## 53940 0 11602 1 3933 544 646 950 2401
## .75 .90 .95
## 5324 9821 13107
##
## lowest : 326 327 334 335 336
## highest: 18803 18804 18806 18818 18823
# Mean price within each color grade.
Hmisc::summarize(diamonds$price, by = diamonds$color, FUN = mean)
## diamonds$color diamonds$price
## 1 D 3169.954
## 2 E 3076.752
## 3 F 3724.886
## 4 G 3999.136
## 5 H 4486.669
## 6 I 5091.875
## 7 J 5323.818
# Highest price within each color grade.
Hmisc::summarize(diamonds$price, by = diamonds$color, FUN = max)
## diamonds$color diamonds$price
## 1 D 18693
## 2 E 18731
## 3 F 18791
## 4 G 18818
## 5 H 18803
## 6 I 18823
## 7 J 18710
# Mean price for every color/cut combination.
Hmisc::summarize(
  diamonds$price,
  by = Hmisc::llist(diamonds$color, diamonds$cut),
  FUN = mean
)
## diamonds$color diamonds$cut diamonds$price
## 1 D Fair 4291.061
## 2 D Good 3405.382
## 5 D Very Good 3470.467
## 4 D Premium 3631.293
## 3 D Ideal 2629.095
## 6 E Fair 3682.312
## 7 E Good 3423.644
## 10 E Very Good 3214.652
## 9 E Premium 3538.914
## 8 E Ideal 2597.550
## 11 F Fair 3827.003
## 12 F Good 3495.750
## 15 F Very Good 3778.820
## 14 F Premium 4324.890
## 13 F Ideal 3374.939
## 16 G Fair 4239.255
## 17 G Good 4123.482
## 20 G Very Good 3872.754
## 19 G Premium 4500.742
## 18 G Ideal 3720.706
## 21 H Fair 5135.683
## 22 H Good 4276.255
## 25 H Very Good 4535.390
## 24 H Premium 5216.707
## 23 H Ideal 3889.335
## 26 I Fair 4685.446
## 27 I Good 5078.533
## 30 I Very Good 5255.880
## 29 I Premium 5946.181
## 28 I Ideal 4451.970
## 31 J Fair 4975.655
## 32 J Good 4574.173
## 35 J Very Good 5103.513
## 34 J Premium 6294.592
## 33 J Ideal 4918.186
# Reshape ----
# Mean price by cut (rows) and color (columns), computed several ways.
library(reshape2)
# reshape2: cast straight to a matrix, aggregating each cell with mean().
reshape2::acast(
  diamonds,
  cut ~ color,
  fun.aggregate = mean,
  value.var = "price"
)
## D E F G H I J
## Fair 4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
## Good 3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
## Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
## Premium 3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
## Ideal 2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186
# Base R: tapply() over the two grouping factors.
tapply(diamonds$price, list(diamonds$cut, diamonds$color), mean)
## D E F G H I J
## Fair 4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
## Good 3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
## Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
## Premium 3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
## Ideal 2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186
# Base R: per-cell price sums from xtabs() divided by per-cell counts.
xtabs(price ~ cut + color, data = diamonds) /
  table(diamonds[c("cut", "color")])
## color
## cut D E F G H I J
## Fair 4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
## Good 3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
## Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
## Premium 3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
## Ideal 2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186
# data.table: dcast() on a data.table copy of the data.
library(data.table)
data.table::dcast(
  data.table::as.data.table(diamonds),
  cut ~ color,
  fun.aggregate = mean,
  value.var = "price"
)
## cut D E F G H I J
## 1 Fair 4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
## 2 Good 3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
## 3 Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
## 4 Premium 3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
## 5 Ideal 2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:data.table':
##
## between, last
##
## The following objects are masked from 'package:Hmisc':
##
## combine, src, summarize
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
# Mean price per cut/color pair, spread into one column per color.
b <- diamonds %>%
  group_by(cut, color) %>%
  summarise(price = mean(price)) %>%
  spread(key = color, value = price)
# Print the resulting table.
b
## Source: local data frame [5 x 8]
##
## cut D E F G H I J
## 1 Fair 4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
## 2 Good 3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
## 3 Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
## 4 Premium 3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
## 5 Ideal 2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186
# Structure of b: the cut label followed by seven numeric color columns.
str(b)
## Classes 'tbl_df', 'tbl' and 'data.frame': 5 obs. of 8 variables:
## $ cut: Ord.factor w/ 5 levels "Fair"<"Good"<..: 1 2 3 4 5
## $ D : num 4291 3405 3470 3631 2629
## $ E : num 3682 3424 3215 3539 2598
## $ F : num 3827 3496 3779 4325 3375
## $ G : num 4239 4123 3873 4501 3721
## $ H : num 5136 4276 4535 5217 3889
## $ I : num 4685 5079 5256 5946 4452
## $ J : num 4976 4574 5104 6295 4918
# Heat map of the mean prices. Only the first column (cut) is dropped so that
# all seven color columns D..J are plotted; indexing with 2:7 would stop at I
# and silently leave J out.
image(as.matrix(b[-1]))

# Subsetting ----
# Rows whose cut is "Ideal".
cut2 <- diamonds[diamonds$cut == "Ideal", ]
head(cut2)
## carat cut color clarity depth table price x y z
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 12 0.23 Ideal J VS1 62.8 56 340 3.93 3.90 2.46
## 14 0.31 Ideal J SI2 62.2 54 344 4.35 4.37 2.71
## 17 0.30 Ideal I SI2 62.0 54 348 4.31 4.34 2.68
## 40 0.33 Ideal I SI2 61.8 55 403 4.49 4.51 2.78
## 41 0.33 Ideal I SI2 61.2 56 403 4.49 4.50 2.75
# Rows that are both "Ideal" cut and color "D".
cut3 <- diamonds[diamonds$cut == "Ideal" & diamonds$color == "D", ]
head(cut3)
## carat cut color clarity depth table price x y z
## 63 0.30 Ideal D SI1 62.5 57 552 4.29 4.32 2.69
## 64 0.30 Ideal D SI1 62.1 56 552 4.30 4.33 2.68
## 121 0.71 Ideal D SI2 62.3 56 2762 5.73 5.69 3.56
## 133 0.71 Ideal D SI1 61.9 59 2764 5.69 5.72 3.53
## 145 0.71 Ideal D SI2 61.6 55 2767 5.74 5.76 3.54
## 156 0.76 Ideal D SI2 62.4 57 2770 5.78 5.83 3.62
# Rows that are either "Ideal" cut or color "D".
cut4 <- diamonds[diamonds$cut == "Ideal" | diamonds$color == "D", ]
head(cut4)
## carat cut color clarity depth table price x y z
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 12 0.23 Ideal J VS1 62.8 56 340 3.93 3.90 2.46
## 14 0.31 Ideal J SI2 62.2 54 344 4.35 4.37 2.71
## 17 0.30 Ideal I SI2 62.0 54 348 4.31 4.34 2.68
## 29 0.23 Very Good D VS2 60.5 61 357 3.96 3.97 2.40
## 35 0.23 Very Good D VS1 61.9 58 402 3.92 3.96 2.44
# Label each diamond by whether its price exceeds 9000, then count the labels.
cut5 <- ifelse(
  diamonds$price > 9000,
  yes = "Expensive",
  no = "Not So Expensive"
)
table(cut5)
## cut5
## Expensive Not So Expensive
## 6298 47642