# Show what is currently defined in the workspace.
ls()
## character(0)
# Remove every object from the workspace, then run the garbage collector.
rm(list = ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 294441 7.9 592000 15.9 460000 12.3
## Vcells 331836 2.6 786432 6.0 677529 5.2
# Memory currently in use and the memory limit of the session.
# NOTE(review): these two calls look Windows-specific - confirm before running elsewhere.
memory.size()
## [1] 16.97
memory.limit()
## [1] 1535
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
library(MASS)
# Load the Boston data set and look at its structure.
data("Boston")
str(object = Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# Open the help page for the data set.
help("Boston")
## starting httpd help server ... done
# Count how often each distinct value of the rm column occurs.
table(Boston[["rm"]])
##
## 3.561 3.863 4.138 4.368 4.519 4.628 4.652 4.88 4.903 4.906 4.926 4.963
## 1 1 2 1 1 1 1 1 1 1 1 1
## 4.97 4.973 5 5.012 5.019 5.036 5.093 5.155 5.186 5.272 5.277 5.304
## 1 1 1 1 1 1 1 1 1 1 1 2
## 5.344 5.349 5.362 5.39 5.399 5.403 5.404 5.412 5.414 5.427 5.453 5.454
## 1 1 1 2 1 1 2 1 1 1 1 1
## 5.456 5.468 5.52 5.531 5.536 5.56 5.565 5.569 5.57 5.572 5.593 5.594
## 1 1 1 1 1 1 1 1 1 1 1 1
## 5.597 5.599 5.602 5.604 5.605 5.608 5.613 5.617 5.627 5.628 5.631 5.637
## 1 1 1 1 1 1 1 1 1 1 1 1
## 5.648 5.663 5.67 5.682 5.683 5.693 5.701 5.705 5.706 5.707 5.708 5.709
## 1 1 1 1 1 1 1 1 1 1 1 1
## 5.713 5.727 5.731 5.741 5.747 5.757 5.759 5.762 5.782 5.783 5.786 5.787
## 3 1 1 1 1 2 1 1 1 1 1 1
## 5.79 5.794 5.803 5.807 5.813 5.818 5.822 5.834 5.836 5.837 5.841 5.85
## 1 1 1 1 2 1 1 1 1 1 1 1
## 5.851 5.852 5.854 5.856 5.857 5.859 5.868 5.869 5.87 5.871 5.872 5.874
## 1 1 2 2 1 1 1 1 1 1 1 1
## 5.875 5.876 5.877 5.878 5.879 5.88 5.884 5.885 5.887 5.888 5.889 5.891
## 2 1 1 1 1 1 1 1 1 2 1 1
## 5.895 5.896 5.898 5.905 5.913 5.914 5.92 5.924 5.926 5.927 5.928 5.933
## 1 1 1 1 1 1 1 1 2 1 1 1
## 5.935 5.936 5.942 5.949 5.95 5.951 5.952 5.957 5.96 5.961 5.963 5.965
## 2 2 1 1 1 1 1 1 1 2 1 1
## 5.966 5.968 5.972 5.976 5.981 5.983 5.985 5.986 5.987 5.99 5.998 6.003
## 2 1 1 1 1 2 1 1 1 1 1 1
## 6.004 6.006 6.009 6.012 6.014 6.015 6.019 6.02 6.021 6.023 6.027 6.03
## 2 1 2 1 1 1 1 1 1 1 1 2
## 6.031 6.037 6.041 6.047 6.051 6.059 6.064 6.065 6.066 6.069 6.072 6.081
## 1 1 1 1 1 1 1 1 1 1 1 1
## 6.083 6.086 6.092 6.095 6.096 6.101 6.103 6.108 6.112 6.113 6.114 6.115
## 1 1 1 1 2 1 1 2 1 1 1 1
## 6.12 6.121 6.122 6.127 6.129 6.13 6.137 6.14 6.142 6.144 6.145 6.151
## 1 1 2 3 1 1 1 1 1 2 1 1
## 6.152 6.153 6.162 6.163 6.164 6.167 6.169 6.172 6.174 6.176 6.182 6.185
## 2 1 2 1 1 3 1 1 1 1 1 2
## 6.193 6.195 6.202 6.208 6.209 6.211 6.212 6.216 6.219 6.223 6.226 6.229
## 2 1 1 1 2 2 1 1 1 1 1 3
## 6.23 6.232 6.24 6.242 6.245 6.249 6.25 6.251 6.254 6.266 6.273 6.279
## 1 1 1 1 1 1 1 2 1 1 1 1
## 6.286 6.29 6.297 6.301 6.302 6.31 6.312 6.315 6.316 6.317 6.319 6.326
## 1 1 1 1 1 1 2 2 1 1 1 2
## 6.333 6.335 6.341 6.343 6.345 6.348 6.358 6.372 6.373 6.375 6.376 6.377
## 1 1 1 1 1 1 1 1 1 1 2 1
## 6.38 6.382 6.383 6.389 6.393 6.395 6.398 6.402 6.404 6.405 6.406 6.411
## 2 1 1 1 1 1 1 1 1 3 1 1
## 6.415 6.416 6.417 6.421 6.425 6.426 6.43 6.431 6.433 6.434 6.436 6.437
## 1 1 3 1 1 1 1 2 1 1 1 1
## 6.438 6.442 6.453 6.454 6.456 6.458 6.459 6.461 6.471 6.474 6.481 6.482
## 1 1 1 1 1 1 1 1 1 1 1 1
## 6.484 6.485 6.487 6.49 6.495 6.51 6.511 6.513 6.516 6.525 6.538 6.54
## 1 1 1 1 2 1 1 1 1 1 1 1
## 6.545 6.546 6.549 6.552 6.556 6.563 6.565 6.567 6.575 6.579 6.59 6.593
## 1 1 1 1 1 1 1 1 1 1 1 1
## 6.595 6.604 6.606 6.616 6.618 6.619 6.625 6.629 6.63 6.631 6.635 6.642
## 1 1 1 1 1 1 1 1 2 1 2 1
## 6.649 6.655 6.657 6.674 6.678 6.683 6.696 6.701 6.715 6.718 6.726 6.727
## 1 1 1 1 1 1 1 1 1 1 1 2
## 6.728 6.739 6.749 6.75 6.758 6.762 6.77 6.781 6.782 6.794 6.8 6.812
## 2 1 1 1 1 1 1 1 2 2 1 1
## 6.816 6.824 6.826 6.833 6.842 6.849 6.852 6.854 6.86 6.861 6.871 6.874
## 1 1 1 1 1 1 1 1 1 1 1 1
## 6.879 6.897 6.939 6.943 6.951 6.957 6.968 6.975 6.976 6.98 6.982 6.998
## 1 1 1 1 2 1 2 1 1 2 1 1
## 7.007 7.014 7.016 7.024 7.041 7.061 7.079 7.088 7.104 7.107 7.135 7.147
## 1 1 1 1 1 1 1 1 1 1 1 1
## 7.148 7.155 7.163 7.178 7.185 7.203 7.206 7.236 7.241 7.249 7.267 7.274
## 1 1 1 1 2 1 1 1 1 1 1 1
## 7.287 7.313 7.327 7.333 7.358 7.393 7.412 7.416 7.42 7.454 7.47 7.489
## 1 1 1 1 1 1 1 1 1 1 1 1
## 7.52 7.61 7.645 7.686 7.691 7.765 7.802 7.82 7.831 7.853 7.875 7.923
## 1 1 1 1 1 1 1 2 1 1 1 1
## 7.929 8.034 8.04 8.069 8.247 8.259 8.266 8.297 8.337 8.375 8.398 8.704
## 1 1 1 1 1 1 1 1 1 1 1 1
## 8.725 8.78
## 1 1
# Box plot of the rm column.
boxplot(Boston$rm)
# Columns are referenced through the data frame rather than via attach():
# attaching Boston would leave a column named rm on the search path next to
# the rm() function used earlier, and the attachment is never undone.
mean(Boston$medv)
## [1] 22.53281
summary(Boston$medv)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.00 17.02 21.20 22.53 25.00 50.00
summary(Boston$rm)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.561 5.886 6.208 6.285 6.624 8.780
# SQL on the data frame: medv and rm for the rows of Boston where rm exceeds 7.
sqldf("select medv,rm from Boston where rm>7")
## Loading required package: tcltk
## medv rm
## 1 34.7 7.185
## 2 36.2 7.147
## 3 34.9 7.024
## 4 35.4 7.249
## 5 33.0 7.104
## 6 23.6 7.007
## 7 28.7 7.079
## 8 38.7 8.069
## 9 43.8 7.820
## 10 33.2 7.416
## 11 50.0 7.489
## 12 50.0 7.802
## 13 50.0 8.375
## 14 50.0 7.929
## 15 39.8 7.765
## 16 37.9 7.155
## 17 50.0 7.831
## 18 34.9 7.185
## 19 36.4 7.178
## 20 50.0 7.875
## 21 33.3 7.287
## 22 30.3 7.107
## 23 34.6 7.274
## 24 32.9 7.135
## 25 42.3 7.610
## 26 48.5 7.853
## 27 50.0 8.034
## 28 44.8 8.266
## 29 50.0 8.725
## 30 37.6 8.040
## 31 31.6 7.163
## 32 46.7 7.686
## 33 31.7 7.412
## 34 41.7 8.337
## 35 48.3 8.247
## 36 31.5 7.358
## 37 42.8 8.259
## 38 44.0 7.454
## 39 50.0 8.704
## 40 36.0 7.333
## 41 33.8 7.203
## 42 43.1 7.520
## 43 48.8 8.398
## 44 31.0 7.327
## 45 36.5 7.206
## 46 30.7 7.014
## 47 50.0 8.297
## 48 43.5 7.470
## 49 35.2 7.691
## 50 33.2 7.267
## 51 45.4 7.820
## 52 46.0 7.645
## 53 50.0 7.923
## 54 32.2 7.088
## 55 37.3 7.148
## 56 29.0 7.041
## 57 36.1 7.236
## 58 33.4 7.420
## 59 32.7 7.241
## 60 21.9 8.780
## 61 50.0 7.016
## 62 15.0 7.313
## 63 17.8 7.393
## 64 25.0 7.061
# Average medv and average rm for each value of chas,
# restricted to the rows where rm exceeds 7.
sqldf("select avg(medv),
avg(rm),
chas
from
Boston
where rm>7
group by
chas")
## avg(medv) avg(rm) chas
## 1 37.87679 7.535482 0
## 2 42.03750 7.812375 1
# CHARACTER TO NUMERIC
# The same amount written five ways; mixing a number with strings in c()
# yields a character vector (see the str() output below).
money <- c(50000, "50000", "$50000", "50,000", "$50,000")
# Averaging the raw character vector does not work
mean(money)
## Warning in mean.default(money): argument is not numeric or logical:
## returning NA
## [1] NA
str(money)
## chr [1:5] "50000" "50000" "$50000" "50,000" "$50,000"
# Direct coercion only succeeds for the entries made of plain digits
money2 <- as.numeric(money)
## Warning: NAs introduced by coercion
money2
## [1] 50000 50000 NA NA NA
# Strip the commas first, then the dollar signs, and convert what is left
money2 <- gsub(",", "", money)
money3 <- gsub("\\$", "", money2)
money3
## [1] "50000" "50000" "50000" "50000" "50000"
money3 <- as.numeric(money3)
mean(money3)
## [1] 50000
# CHARACTER TO DATES
# Four dates of birth, each typed in a different layout.
dobofclass <- c(
  "1April2007",
  "28th july 1984",
  "05 May 1988",
  "29nov-2008"
)
# strptime() needs a format that matches the text: the first format fits
# "29nov-2008" but gives NA for "05 May 1988", which needs its own format.
strptime("29nov-2008", format = "%d%b-%Y")
## [1] "2008-11-29 IST"
strptime("05 May 1988", format = "%d%b-%Y")
## [1] NA
strptime("05 May 1988", format = "%d %B %Y")
## [1] "1988-05-05 IST"
library(lubridate)
# dmy() parses all four day-month-year layouts in a single call.
dmy(dobofclass)
## [1] "2007-04-01 UTC" "1984-07-28 UTC" "1988-05-05 UTC" "2008-11-29 UTC"
Sys.Date()
## [1] "2015-12-12"
# The difference between dates is given by difftime
difftime(time1 = Sys.Date(), time2 = dmy(dobofclass))
## Time differences in days
## [1] 3177 11459 10082 2569
help("strptime")
# Converting to character
x <- c(23, 56, 78, 89)
as.character(x)
## [1] "23" "56" "78" "89"
paste(x)
## [1] "23" "56" "78" "89"
paste("ajay", dobofclass[1])
## [1] "ajay 1April2007"
paste("student1", dobofclass[2])
## [1] "student1 28th july 1984"
substr("ajay", start = 2, stop = 3)
## [1] "ja"
namclass <- c("Ajay", "Ajith", "Sudeeptha", "Yogisha")
# Give me the first initial of every member of namclass, i.e. A, A, S, Y
substr(namclass, start = 1, stop = 1)
## [1] "A" "A" "S" "Y"
nchar(namclass)
## [1] 4 5 9 7
# Give me the last letter of every member of namclass
substr(namclass, start = nchar(namclass), stop = nchar(namclass)) # MODIFY THIS
## [1] "y" "h" "a" "a"
# Load mtcars and preview its first six rows.
data(mtcars)
library(sqldf)
head(mtcars, n = 6L)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# Give me the average mpg: overall, then for every cyl,
# then for every combination of cyl and gear.
# Overall average
sqldf("select avg(mpg) from mtcars ")
## avg(mpg)
## 1 20.09062
# Average per cyl
sqldf("select avg(mpg),cyl from mtcars group by cyl")
## avg(mpg) cyl
## 1 26.66364 4
## 2 19.74286 6
## 3 15.10000 8
# Average per cyl and gear
sqldf("select avg(mpg),cyl,gear from mtcars group by cyl,gear")
## avg(mpg) cyl gear
## 1 21.500 4 3
## 2 26.925 4 4
## 3 28.200 4 5
## 4 19.750 6 3
## 5 19.750 6 4
## 6 19.700 6 5
## 7 15.050 8 3
## 8 15.400 8 5
getwd()
## [1] "C:/Users/dell/Desktop/Teaching"
# pattern is a regular expression, so the dot is escaped and the match is
# anchored at the end of the name; the unanchored ".csv" also matched
# "BigDiamonds.csv.zip".
dir("C:/Users/dell/Desktop/", pattern = "\\.csv$")
## [1] "Analytics decisionstats.com Audience Overview 20110617-20120717.csv"
## [2] "BigDiamonds.csv"
## [3] "Boston.csv"
## [4] "ccFraud.csv"
## [5] "test.csv"
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, mday, month, quarter, wday, week, yday, year
# Read the diamonds CSV with data.table's fread().
diamonds <- fread(input = "C:/Users/dell/Desktop/BigDiamonds.csv")
##
## Read 1.7% of 598024 rows
## Read 16.7% of 598024 rows
## Read 38.5% of 598024 rows
## Read 53.5% of 598024 rows
## Read 63.5% of 598024 rows
## Read 68.6% of 598024 rows
## Read 93.6% of 598024 rows
## Read 598024 rows and 13 (of 13) columns from 0.049 GB file in 00:00:12
# Overview of the data.tables in the session; the recorded output lists
# name, row/column counts, size in MB, column names and key.
tables()
## NAME NROW NCOL MB
## [1,] diamonds 598,024 13 75
## COLS
## [1,] V1,carat,cut,color,clarity,table,depth,cert,measurements,price,x,y,z
## KEY
## [1,]
## Total: 75MB
library("sqldf")
sqldf("select avg(price) from diamonds") # 1
## avg(price)
## 1 8753.018
# The same average in data.table syntax. TRUE is spelled out because T is an
# ordinary variable that can be reassigned; the empty trailing argument is dropped.
diamonds[, mean(price, na.rm = TRUE)]
## [1] 8753.018
sqldf("select avg(carat) from diamonds") # 2
## avg(carat)
## 1 1.071297
# DT[i, j, by]: i picks the rows (WHERE), j is what to compute (SELECT),
# by is the grouping (GROUP BY)
diamonds[, mean(carat, na.rm = TRUE)]
## [1] 1.071297
# Average carat for every color, first in SQL ...
sqldf("select
avg(carat),color
from diamonds
group by color") # 3
## avg(carat) color
## 1 0.8266182 D
## 2 0.8318824 E
## 3 0.9410532 F
## 4 1.0638408 G
## 5 1.2099407 H
## 6 1.2712823 I
## 7 1.3475399 J
## 8 1.4950646 K
## 9 1.3632705 L
# ... then in data.table, with color as the grouping column
# (TRUE instead of the reassignable shorthand T).
diamonds[, mean(carat, na.rm = TRUE), by = color]
## color V1
## 1: K 1.4950646
## 2: G 1.0638408
## 3: J 1.3475399
## 4: D 0.8266182
## 5: F 0.9410532
## 6: E 0.8318824
## 7: H 1.2099407
## 8: I 1.2712823
## 9: L 1.3632705
# Average carat for every cut, in SQL and then in data.table.
sqldf("select
avg(carat),cut
from diamonds group by cut") # 4
## avg(carat) cut
## 1 0.9003031 Good
## 2 1.1201936 Ideal
## 3 1.0247597 V.Good
diamonds[, mean(carat), by = cut]
## cut V1
## 1: V.Good 1.0247597
## 2: Good 0.9003031
## 3: Ideal 1.1201936
# Mean carat and mean price per cut for diamonds above 3 carats
# (TRUE instead of the reassignable shorthand T).
diamonds[
  carat > 3,
  .(mean(carat), mean(price, na.rm = TRUE)),
  by = cut
]
## cut V1 V2
## 1: Good 3.675146 43480.94
## 2: V.Good 3.616123 45957.94
## 3: Ideal 3.548771 49433.78
# What is the average price for diamonds above 4 carats:
# first for color K only, then for every color?
diamonds[carat > 4 & color == "K", mean(price, na.rm = TRUE)]
## [1] 55531.86
diamonds[carat > 4, mean(price, na.rm = TRUE), by = color]
## color V1
## 1: D 55053.01
## 2: E 55229.97
## 3: G 58614.04
## 4: K 55531.86
## 5: J 60725.33
## 6: I 65464.33
## 7: H 62630.72
## 8: F 59957.27
## 9: L 49344.77
# Average price for diamonds below 3 carats: color J only, then every color.
diamonds[carat < 3 & color == "J", mean(price, na.rm = TRUE)]
## [1] 5767.972
diamonds[carat < 3, mean(price, na.rm = TRUE), by = color]
## color V1
## 1: K 5721.485
## 2: G 7129.863
## 3: J 5767.972
## 4: D 7794.984
## 5: F 6931.228
## 6: E 6580.012
## 7: H 7291.754
## 8: I 6809.520
## 9: L 3932.475
# Average price for diamonds below 4.5 carats: the K / Ideal combination only,
# then every color and cut combination.
diamonds[carat < 4.5 & color == "K" & cut == "Ideal", mean(price, na.rm = TRUE)]
## [1] 10046.97
diamonds[carat < 4.5, mean(price, na.rm = TRUE), by = .(color, cut)]
## color cut V1
## 1: K V.Good 7598.953
## 2: G Good 5611.800
## 3: J Good 5038.752
## 4: D V.Good 6364.563
## 5: F Good 4919.152
## 6: F V.Good 6908.591
## 7: G V.Good 7866.847
## 8: J V.Good 7281.392
## 9: E Good 4156.141
## 10: D Good 4644.282
## 11: E Ideal 8297.600
## 12: E V.Good 6210.817
## 13: H V.Good 7859.716
## 14: F Ideal 9183.598
## 15: H Good 5839.827
## 16: I Good 5096.376
## 17: K Good 4935.032
## 18: I V.Good 7493.896
## 19: L Good 4138.479
## 20: I Ideal 10361.547
## 21: G Ideal 9653.360
## 22: D Ideal 9605.492
## 23: J Ideal 9676.903
## 24: H Ideal 10870.121
## 25: L V.Good 5259.427
## 26: K Ideal 10046.967
## 27: L Ideal 7059.294
## color cut V1
# Average price per color and cut among diamonds priced above 3000.
diamonds[price > 3000, mean(price, na.rm = TRUE), by = .(color, cut)]
## color cut V1
## 1: G V.Good 13912.689
## 2: F Ideal 16560.190
## 3: J Ideal 15106.234
## 4: G Ideal 16127.682
## 5: E Good 9562.663
## 6: I Good 10568.739
## 7: I V.Good 13373.244
## 8: E V.Good 12512.359
## 9: H V.Good 13546.145
## 10: G Good 11217.941
## 11: F V.Good 13153.974
## 12: E Ideal 16549.568
## 13: I Ideal 15601.083
## 14: H Ideal 16766.347
## 15: D Ideal 17991.784
## 16: D V.Good 13063.586
## 17: J Good 10946.462
## 18: K Ideal 15217.389
## 19: F Good 11114.473
## 20: H Good 11490.488
## 21: K V.Good 15361.029
## 22: J V.Good 14305.686
## 23: D Good 10355.956
## 24: L Ideal 12607.748
## 25: L V.Good 13453.077
## 26: K Good 12485.608
## 27: L Good 13843.234
## color cut V1
# The same average for the single I / V.Good combination.
diamonds[price > 3000 & color == "I" & cut == "V.Good", mean(price, na.rm = TRUE)]
## [1] 13373.24
library(Hmisc)
## Loading required package: grid
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
##
## The following object is masked _by_ '.GlobalEnv':
##
## diamonds
##
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units

# Base summary of price; the recorded output reports 713 NA's.
summary(diamonds$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 300 1220 3503 8753 11170 99990 713
# Hmisc describe() of the same column (n, missing, unique, quantiles,
# lowest/highest values in the recorded output).
describe(diamonds$price)
## diamonds$price
## n missing unique Info Mean .05 .10 .25 .50
## 597311 713 40312 1 8753 612 736 1220 3503
## .75 .90 .95
## 11174 22209 33665
##
## lowest : 300 301 302 303 304
## highest: 99930 99942 99960 99966 99990
# Keep only the rows without missing values.
diamonds2=na.omit(diamonds)
# Maximum price for every cut / color combination. dplyr is not attached yet at
# this point, so this is the summarize() that came in with Hmisc; the recorded
# output labels its columns with the expressions passed in.
summarize(diamonds2$price,llist(diamonds2$cut,diamonds2$color),max)
## diamonds2$cut diamonds2$color diamonds2$price
## 1 Good D 97164
## 2 Good E 99920
## 3 Good F 99630
## 4 Good G 96536
## 5 Good H 99359
## 6 Good I 93780
## 7 Good J 99806
## 8 Good K 96073
## 9 Good L 68793
## 10 Ideal D 99920
## 11 Ideal E 99930
## 12 Ideal F 99960
## 13 Ideal G 99930
## 14 Ideal H 99990
## 15 Ideal I 99910
## 16 Ideal J 99630
## 17 Ideal K 99690
## 18 Ideal L 95814
## 19 V.Good D 99870
## 20 V.Good E 99966
## 21 V.Good F 99890
## 22 V.Good G 99472
## 23 V.Good H 99810
## 24 V.Good I 99942
## 25 V.Good J 98273
## 26 V.Good K 98934
## 27 V.Good L 95253
#install.packages("dplyr")
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:Hmisc':
##
## combine, src, summarize
##
## The following objects are masked from 'package:data.table':
##
## between, last
##
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
##
## The following object is masked from 'package:MASS':
##
## select
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep only the four columns used below, then print the result.
diamonds3 <- select(
  diamonds,
  price, carat, cut, color
)
diamonds3
## price carat cut color
## 1: NA 0.25 V.Good K
## 2: NA 0.23 Good G
## 3: NA 0.34 Good J
## 4: NA 0.21 V.Good D
## 5: NA 0.31 V.Good K
## ---
## 598020: 99930 3.02 Ideal E
## 598021: 99942 5.01 V.Good I
## 598022: 99960 3.43 Ideal F
## 598023: 99966 3.01 V.Good E
## 598024: 99990 4.13 Ideal H
# Price per carat as the only column of the result (data first, then the
# new column), then print it.
diamonds4 <- transmute(diamonds3, prbycarat = price / carat)
diamonds4
## prbycarat
## 1: NA
## 2: NA
## 3: NA
## 4: NA
## 5: NA
## ---
## 598020: 33089.40
## 598021: 19948.50
## 598022: 29142.86
## 598023: 33211.30
## 598024: 24210.65
# Random sample of 600 rows without replacement. The arguments are named and
# FALSE is spelled out, since F is an ordinary variable that can be reassigned.
sample_n(diamonds, size = 600, replace = FALSE)
## V1 carat cut color clarity table depth cert
## 1: 194441 0.59 Ideal J IF 57 61.9 GIA
## 2: 162462 0.60 Good F SI2 58 64.7 GIA
## 3: 99624 0.31 Ideal F VVS2 57 62.6 GIA
## 4: 379140 1.20 Ideal F VS2 57 62.7 EGL Intl.
## 5: 184421 0.60 V.Good J VS1 57 63.2 GIA
## ---
## 596: 554867 1.52 Ideal G VVS1 57 62.2 GIA
## 597: 556055 3.08 Ideal J SI2 57 62.5 EGL
## 598: 593859 4.67 Good F SI2 60 59.2 GIA
## 599: 16597 0.35 Ideal K SI1 57 62.4 GIA
## 600: 392909 0.90 Ideal G VS1 57 62.5 GIA
## measurements price x y z
## 1: 5.38 x 5.36 x 3.33 1681 5.38 5.36 3.33
## 2: 5.26 x 5.27 x 3.41 1333 5.26 5.27 3.41
## 3: 4.33 x 4.31 x 2.71 890 4.33 4.31 2.71
## 4: 6.74 x 6.78 x 4.24 6110 6.74 6.78 4.24
## 5: 5.37 x 5.29 x 3.37 1566 5.37 5.29 3.37
## ---
## 596: 7.36 x 7.33 x 4.57 18878 7.36 7.33 4.57
## 597: 9.28 x 9.24 x 5.78 19210 9.28 9.24 5.78
## 598: 10.69 x 10.87 x 6.40 63272 10.69 10.87 6.40
## 599: 4.53 x 4.51 x 2.82 540 4.53 4.51 2.82
## 600: 6.17 x 6.12 x 3.84 6767 6.17 6.12 3.84
# Random sample of 1% of the rows.
sample_frac(diamonds, size = 0.01)
## V1 carat cut color clarity table depth cert
## 1: 577674 2.02 Ideal G VS1 57 62.3 GIA
## 2: 230810 0.60 V.Good D SI1 57 63.8 GIA
## 3: 33973 0.23 Ideal E VVS1 57 61.2 GIA
## 4: 221946 0.70 Ideal F SI2 57 62.1 IGI
## 5: 367651 1.41 V.Good H SI1 59 63.5 EGL
## ---
## 5976: 94131 0.37 Ideal E VS2 59 60.4 GIA
## 5977: 431896 1.60 Ideal H VS1 57 62.4 EGL
## 5978: 7494 0.33 V.Good F I1 57 62.8 GIA
## 5979: 373729 0.90 V.Good F VVS2 56 62.8 GIA
## 5980: 498128 2.01 V.Good H VS1 56 64.3 GIA
## measurements price x y z
## 1: 8.07 x 8.12 x 5.04 30109 8.07 8.12 5.04
## 2: 5.33 x 5.31 x 3.39 2130 5.33 5.31 3.39
## 3: 3.96 x 3.98 x 2.43 630 3.96 3.98 2.43
## 4: 5.66 x 5.69 x 3.53 2020 5.66 5.69 3.53
## 5: 7.03 x 6.99 x 4.45 5620 7.03 6.99 4.45
## ---
## 5976: 4.64 x 4.66 x 2.81 866 4.64 4.66 2.81
## 5977: 4.66 x 7.46 x 7.48 9424 4.66 7.46 7.48
## 5978: 4.42 x 4.44 x 2.78 470 4.42 4.44 2.78
## 5979: 6.15 x 6.11 x 3.85 5879 6.15 6.11 3.85
## 5980: 7.89 x 7.96 x 5.09 22896 7.89 7.96 5.09
# vignette("introduction", package = "dplyr")
# Using the vignette and summarise(), do the following in dplyr:
# mean price for every color. TRUE replaces the reassignable shorthand T;
# the unnamed summary column is labelled with the expression, so the
# recorded header below shows TRUE as well.
dcolor <- group_by(diamonds, color)
summarise(dcolor, mean(price, na.rm = TRUE))
## Source: local data table [9 x 2]
##
## color mean(price, na.rm = TRUE)
## 1 K 9694.257
## 2 G 8984.200
## 3 J 9423.581
## 4 D 8266.346
## 5 F 8234.730
## 6 E 7282.990
## 7 H 9941.795
## 8 I 9541.319
## 9 L 7109.228
# Mean price per color for diamonds above 4 carats (TRUE instead of T; the
# summary column is labelled with the expression, hence the header below).
diamondstest <- filter(diamonds, carat > 4)
dcolor <- group_by(diamondstest, color)
summarise(dcolor, mean(price, na.rm = TRUE))
## Source: local data table [9 x 2]
##
## color mean(price, na.rm = TRUE)
## 1 D 55053.01
## 2 E 55229.97
## 3 G 58614.04
## 4 K 55531.86
## 5 J 60725.33
## 6 I 65464.33
## 7 H 62630.72
## 8 F 59957.27
## 9 L 49344.77
# Mean price for color J diamonds below 3 carats (TRUE instead of T; the
# summary column is labelled with the expression, hence the header below).
diamondstest2 <- filter(diamonds, carat < 3 & color == "J")
dcolor2 <- group_by(diamondstest2, color)
summarise(dcolor2, mean(price, na.rm = TRUE))
## Source: local data table [1 x 2]
##
## color mean(price, na.rm = TRUE)
## 1 J 5767.972
# Mean price for K / Ideal diamonds below 4.5 carats (TRUE instead of T; the
# summary column is labelled with the expression, hence the header below).
diamondstest3 <- filter(diamonds, carat < 4.5 & color == "K" & cut == "Ideal")
dcolor3 <- group_by(diamondstest3, color)
summarise(dcolor3, mean(price, na.rm = TRUE))
## Source: local data table [1 x 2]
##
## color mean(price, na.rm = TRUE)
## 1 K 10046.97
# Mean price for J / Ideal diamonds below 3 carats, grouped by cut (TRUE
# instead of T; the summary column is labelled with the expression).
diamondstest4 <- filter(diamonds, carat < 3 & color == "J" & cut == "Ideal")
dcut2 <- group_by(diamondstest4, cut)
summarise(dcut2, mean(price, na.rm = TRUE))
## Source: local data table [1 x 2]
##
## cut mean(price, na.rm = TRUE)
## 1 Ideal 6639.536
# Mean price for I / V.Good diamonds priced above 3000, grouped by cut.
# This overwrites diamondstest4 and dcut2 from the previous query.
# TRUE replaces T; the summary column is labelled with the expression.
diamondstest4 <- filter(
  diamonds,
  price > 3000 &
    color == "I" &
    cut == "V.Good"
)
dcut2 <- group_by(diamondstest4, cut)
summarise(dcut2, mean(price, na.rm = TRUE))
## Source: local data table [1 x 2]
##
## cut mean(price, na.rm = TRUE)
## 1 V.Good 13373.24
# What is the average price for every color and cut combination among
# diamonds priced above 3000? (data.table version; TRUE instead of T)
diamonds[price > 3000, mean(price, na.rm = TRUE), by = .(color, cut)]
## color cut V1
## 1: G V.Good 13912.689
## 2: F Ideal 16560.190
## 3: J Ideal 15106.234
## 4: G Ideal 16127.682
## 5: E Good 9562.663
## 6: I Good 10568.739
## 7: I V.Good 13373.244
## 8: E V.Good 12512.359
## 9: H V.Good 13546.145
## 10: G Good 11217.941
## 11: F V.Good 13153.974
## 12: E Ideal 16549.568
## 13: I Ideal 15601.083
## 14: H Ideal 16766.347
## 15: D Ideal 17991.784
## 16: D V.Good 13063.586
## 17: J Good 10946.462
## 18: K Ideal 15217.389
## 19: F Good 11114.473
## 20: H Good 11490.488
## 21: K V.Good 15361.029
## 22: J V.Good 14305.686
## 23: D Good 10355.956
## 24: L Ideal 12607.748
## 25: L V.Good 13453.077
## 26: K Good 12485.608
## 27: L Good 13843.234
## color cut V1
# Average price for the single I / V.Good combination among diamonds priced
# above 3000 (TRUE instead of T; the empty trailing argument is dropped).
diamonds[
  price > 3000 &
    color == "I" &
    cut == "V.Good",
  mean(price, na.rm = TRUE)
]
## [1] 13373.24