getwd()
## [1] "C:/Users/Dell/Desktop"
setwd("C:\\Users\\Dell\\Downloads\\BigDiamonds.csv")
dir(pattern = 'csv')
## [1] "BigDiamonds.csv"
ls()
## character(0)
rm(list=ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 361857 19.4 592000 31.7 460000 24.6
## Vcells 548259 4.2 1023718 7.9 858019 6.6
diamonds=read.csv("BigDiamonds.csv")
str(diamonds)
## 'data.frame': 598024 obs. of 13 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ carat : num 0.25 0.23 0.34 0.21 0.31 0.2 0.2 0.22 0.23 0.2 ...
## $ cut : Factor w/ 3 levels "Good","Ideal",..: 3 1 1 3 3 1 1 3 3 1 ...
## $ color : Factor w/ 9 levels "D","E","F","G",..: 8 4 7 1 8 4 4 1 8 3 ...
## $ clarity : Factor w/ 9 levels "I1","I2","IF",..: 1 1 2 1 1 5 5 1 5 4 ...
## $ table : num 59 61 58 60 59 60 63 61 57.5 65 ...
## $ depth : num 63.7 58.1 58.7 60.6 62.2 64.4 62.6 59.2 63.6 54.9 ...
## $ cert : Factor w/ 9 levels "AGS","EGL","EGL Intl.",..: 6 6 6 6 2 6 6 6 8 6 ...
## $ measurements: Factor w/ 241453 levels ""," 3.99 x 3.95 x 2.44",..: 19960 21917 48457 15701 37341 14661 14400 19642 17115 16177 ...
## $ price : int NA NA NA NA NA NA NA NA NA NA ...
## $ x : num 3.96 4 4.56 3.8 4.35 3.74 3.72 3.95 3.87 3.83 ...
## $ y : num 3.95 4.05 4.53 3.82 4.26 3.67 3.65 3.97 3.9 4 ...
## $ z : num 2.52 2.3 2.67 2.31 2.68 2.38 2.31 2.34 2.47 2.14 ...
summary(diamonds)
## X carat cut color
## Min. : 1 Min. :0.200 Good : 59680 G :96204
## 1st Qu.:149507 1st Qu.:0.500 Ideal :369448 F :93573
## Median :299013 Median :0.900 V.Good:168896 E :93483
## Mean :299013 Mean :1.071 H :86619
## 3rd Qu.:448518 3rd Qu.:1.500 D :73630
## Max. :598024 Max. :9.250 I :70282
## (Other):84233
## clarity table depth cert
## SI1 :116631 Min. : 0.00 Min. : 0.00 GIA :463555
## VS2 :111082 1st Qu.:56.00 1st Qu.:61.00 IGI : 43667
## SI2 :104300 Median :58.00 Median :62.10 EGL : 33814
## VS1 : 97730 Mean :57.63 Mean :61.06 EGL USA : 16079
## VVS2 : 65500 3rd Qu.:59.00 3rd Qu.:62.70 EGL Intl. : 11447
## VVS1 : 54798 Max. :75.90 Max. :81.30 EGL ISRAEL: 11301
## (Other): 47983 (Other) : 18161
## measurements price x
## 0.00 x 0.00 x 0.00: 425 Min. : 300 Min. : 0.150
## 0.00 x 0.00 x 0.00 : 222 1st Qu.: 1220 1st Qu.: 4.740
## 4.3 x 4.27 x 2.67 : 97 Median : 3503 Median : 5.780
## 4.31 x 4.29 x 2.68 : 87 Mean : 8753 Mean : 5.991
## 4.29 x 4.26 x 2.67 : 86 3rd Qu.:11174 3rd Qu.: 6.970
## 4.3 x 4.28 x 2.67 : 84 Max. :99990 Max. :13.890
## (Other) :597023 NA's :713 NA's :1815
## y z
## Min. : 1.000 Min. : 0.040
## 1st Qu.: 4.970 1st Qu.: 3.120
## Median : 6.050 Median : 3.860
## Mean : 6.199 Mean : 4.033
## 3rd Qu.: 7.230 3rd Qu.: 4.610
## Max. :13.890 Max. :13.180
## NA's :1852 NA's :2544
diamonds$pricepercarat=diamonds$price/diamonds$carat
diamonds2=na.omit(diamonds)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked _by_ '.GlobalEnv':
##
## diamonds
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
summarize(diamonds2$pricepercarat,diamonds2$color,max)
## diamonds2$color diamonds2$pricepercarat
## 1 D 49519.40
## 2 E 40871.01
## 3 F 37084.06
## 4 G 32998.33
## 5 H 31718.95
## 6 I 24982.45
## 7 J 22890.12
## 8 K 19516.77
## 9 L 14585.41
summarize(diamonds2$pricepercarat,diamonds2$color,median)
## diamonds2$color diamonds2$pricepercarat
## 1 D 4114.000
## 2 E 3836.000
## 3 F 4124.876
## 4 G 4314.286
## 5 H 4551.111
## 6 I 4514.706
## 7 J 4330.348
## 8 K 3851.258
## 9 L 2800.000
diamonds2=na.omit(diamonds)
library(Hmisc)
summarize(diamonds2$pricepercarat,diamonds2$clarity,min)
## diamonds2$clarity diamonds2$pricepercarat
## 1 I1 622.2222
## 2 I2 525.0000
## 3 IF 1093.7500
## 4 SI1 735.4167
## 5 SI2 726.4706
## 6 VS1 872.8814
## 7 VS2 866.6667
## 8 VVS1 1139.0244
## 9 VVS2 1038.4615
summarize(diamonds2$pricepercarat,diamonds2$color,min)
## diamonds2$color diamonds2$pricepercarat
## 1 D 603.8136
## 2 E 604.7809
## 3 F 671.1111
## 4 G 654.2373
## 5 H 660.8696
## 6 I 525.0000
## 7 J 630.6122
## 8 K 637.6812
## 9 L 683.3333
library(data.table)
diamonds3=data.table(diamonds2)
diamonds3[,max(pricepercarat),color]
## color V1
## 1: G 32998.33
## 2: K 19516.77
## 3: J 22890.12
## 4: H 31718.95
## 5: F 37084.06
## 6: I 24982.45
## 7: D 49519.40
## 8: E 40871.01
## 9: L 14585.41
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
head(diamonds2)
## X carat cut color clarity table depth cert measurements
## 494 494 0.24 V.Good G SI1 61.0 58.9 GIA 4.09 x 4.10 x 2.41
## 495 495 0.31 V.Good K SI2 59.0 60.2 GIA 4.40 x 4.42 x 2.65
## 496 496 0.26 Good J VS2 56.5 64.1 IGI 4.01 x 4.05 x 2.58
## 497 497 0.24 Ideal G SI1 55.0 61.3 GIA 4.01 x 4.03 x 2.47
## 498 498 0.30 Good H I1 57.0 62.2 GIA 4.21 x 4.24 x 2.63
## 499 499 0.34 Good F I1 66.0 55.0 GIA 4.75 x 4.61 x 2.57
## price x y z pricepercarat
## 494 300 4.09 4.10 2.41 1250.0000
## 495 300 4.40 4.42 2.65 967.7419
## 496 300 4.01 4.05 2.58 1153.8462
## 497 300 4.01 4.03 2.47 1250.0000
## 498 300 4.21 4.24 2.63 1000.0000
## 499 300 4.75 4.61 2.57 882.3529
diamonds2$X=NULL
sqldf("select max(pricepercarat),color
from
diamonds2 group by color")
## Loading required package: tcltk
## Warning: Quoted identifiers should have class SQL, use DBI::SQL() if the
## caller performs the quoting.
## max(pricepercarat) color
## 1 49519.40 D
## 2 40871.01 E
## 3 37084.06 F
## 4 32998.33 G
## 5 31718.95 H
## 6 24982.45 I
## 7 22890.12 J
## 8 19516.77 K
## 9 14585.41 L