getwd()
## [1] "C:/Users/Dell/Desktop"
setwd("C:\\Users\\Dell\\Downloads\\BigDiamonds.csv")
dir(pattern = 'csv')
## [1] "BigDiamonds.csv"
ls()
## character(0)
rm(list=ls())
gc()
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 361857 19.4     592000 31.7   460000 24.6
## Vcells 548259  4.2    1023718  7.9   858019  6.6
diamonds=read.csv("BigDiamonds.csv")
str(diamonds)
## 'data.frame':    598024 obs. of  13 variables:
##  $ X           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ carat       : num  0.25 0.23 0.34 0.21 0.31 0.2 0.2 0.22 0.23 0.2 ...
##  $ cut         : Factor w/ 3 levels "Good","Ideal",..: 3 1 1 3 3 1 1 3 3 1 ...
##  $ color       : Factor w/ 9 levels "D","E","F","G",..: 8 4 7 1 8 4 4 1 8 3 ...
##  $ clarity     : Factor w/ 9 levels "I1","I2","IF",..: 1 1 2 1 1 5 5 1 5 4 ...
##  $ table       : num  59 61 58 60 59 60 63 61 57.5 65 ...
##  $ depth       : num  63.7 58.1 58.7 60.6 62.2 64.4 62.6 59.2 63.6 54.9 ...
##  $ cert        : Factor w/ 9 levels "AGS","EGL","EGL Intl.",..: 6 6 6 6 2 6 6 6 8 6 ...
##  $ measurements: Factor w/ 241453 levels "","  3.99  x   3.95  x   2.44",..: 19960 21917 48457 15701 37341 14661 14400 19642 17115 16177 ...
##  $ price       : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ x           : num  3.96 4 4.56 3.8 4.35 3.74 3.72 3.95 3.87 3.83 ...
##  $ y           : num  3.95 4.05 4.53 3.82 4.26 3.67 3.65 3.97 3.9 4 ...
##  $ z           : num  2.52 2.3 2.67 2.31 2.68 2.38 2.31 2.34 2.47 2.14 ...
summary(diamonds)
##        X              carat           cut             color      
##  Min.   :     1   Min.   :0.200   Good  : 59680   G      :96204  
##  1st Qu.:149507   1st Qu.:0.500   Ideal :369448   F      :93573  
##  Median :299013   Median :0.900   V.Good:168896   E      :93483  
##  Mean   :299013   Mean   :1.071                   H      :86619  
##  3rd Qu.:448518   3rd Qu.:1.500                   D      :73630  
##  Max.   :598024   Max.   :9.250                   I      :70282  
##                                                   (Other):84233  
##     clarity           table           depth               cert       
##  SI1    :116631   Min.   : 0.00   Min.   : 0.00   GIA       :463555  
##  VS2    :111082   1st Qu.:56.00   1st Qu.:61.00   IGI       : 43667  
##  SI2    :104300   Median :58.00   Median :62.10   EGL       : 33814  
##  VS1    : 97730   Mean   :57.63   Mean   :61.06   EGL USA   : 16079  
##  VVS2   : 65500   3rd Qu.:59.00   3rd Qu.:62.70   EGL Intl. : 11447  
##  VVS1   : 54798   Max.   :75.90   Max.   :81.30   EGL ISRAEL: 11301  
##  (Other): 47983                                   (Other)   : 18161  
##                  measurements        price             x         
##  0.00  x  0.00  x  0.00:   425   Min.   :  300   Min.   : 0.150  
##  0.00 x 0.00 x 0.00    :   222   1st Qu.: 1220   1st Qu.: 4.740  
##  4.3 x 4.27 x 2.67     :    97   Median : 3503   Median : 5.780  
##  4.31 x 4.29 x 2.68    :    87   Mean   : 8753   Mean   : 5.991  
##  4.29 x 4.26 x 2.67    :    86   3rd Qu.:11174   3rd Qu.: 6.970  
##  4.3 x 4.28 x 2.67     :    84   Max.   :99990   Max.   :13.890  
##  (Other)               :597023   NA's   :713     NA's   :1815    
##        y                z         
##  Min.   : 1.000   Min.   : 0.040  
##  1st Qu.: 4.970   1st Qu.: 3.120  
##  Median : 6.050   Median : 3.860  
##  Mean   : 6.199   Mean   : 4.033  
##  3rd Qu.: 7.230   3rd Qu.: 4.610  
##  Max.   :13.890   Max.   :13.180  
##  NA's   :1852     NA's   :2544
diamonds$pricepercarat=diamonds$price/diamonds$carat

diamonds2=na.omit(diamonds)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked _by_ '.GlobalEnv':
## 
##     diamonds
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
summarize(diamonds2$pricepercarat,diamonds2$color,max)
##   diamonds2$color diamonds2$pricepercarat
## 1               D                49519.40
## 2               E                40871.01
## 3               F                37084.06
## 4               G                32998.33
## 5               H                31718.95
## 6               I                24982.45
## 7               J                22890.12
## 8               K                19516.77
## 9               L                14585.41
summarize(diamonds2$pricepercarat,diamonds2$color,median)
##   diamonds2$color diamonds2$pricepercarat
## 1               D                4114.000
## 2               E                3836.000
## 3               F                4124.876
## 4               G                4314.286
## 5               H                4551.111
## 6               I                4514.706
## 7               J                4330.348
## 8               K                3851.258
## 9               L                2800.000
diamonds2=na.omit(diamonds)
library(Hmisc)
summarize(diamonds2$pricepercarat,diamonds2$clarity,min)
##   diamonds2$clarity diamonds2$pricepercarat
## 1                I1                622.2222
## 2                I2                525.0000
## 3                IF               1093.7500
## 4               SI1                735.4167
## 5               SI2                726.4706
## 6               VS1                872.8814
## 7               VS2                866.6667
## 8              VVS1               1139.0244
## 9              VVS2               1038.4615
summarize(diamonds2$pricepercarat,diamonds2$color,min)
##   diamonds2$color diamonds2$pricepercarat
## 1               D                603.8136
## 2               E                604.7809
## 3               F                671.1111
## 4               G                654.2373
## 5               H                660.8696
## 6               I                525.0000
## 7               J                630.6122
## 8               K                637.6812
## 9               L                683.3333
library(data.table)

diamonds3=data.table(diamonds2)
diamonds3[,max(pricepercarat),color]
##    color       V1
## 1:     G 32998.33
## 2:     K 19516.77
## 3:     J 22890.12
## 4:     H 31718.95
## 5:     F 37084.06
## 6:     I 24982.45
## 7:     D 49519.40
## 8:     E 40871.01
## 9:     L 14585.41
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
head(diamonds2)
##       X carat    cut color clarity table depth cert       measurements
## 494 494  0.24 V.Good     G     SI1  61.0  58.9  GIA 4.09 x 4.10 x 2.41
## 495 495  0.31 V.Good     K     SI2  59.0  60.2  GIA 4.40 x 4.42 x 2.65
## 496 496  0.26   Good     J     VS2  56.5  64.1  IGI 4.01 x 4.05 x 2.58
## 497 497  0.24  Ideal     G     SI1  55.0  61.3  GIA 4.01 x 4.03 x 2.47
## 498 498  0.30   Good     H      I1  57.0  62.2  GIA 4.21 x 4.24 x 2.63
## 499 499  0.34   Good     F      I1  66.0  55.0  GIA 4.75 x 4.61 x 2.57
##     price    x    y    z pricepercarat
## 494   300 4.09 4.10 2.41     1250.0000
## 495   300 4.40 4.42 2.65      967.7419
## 496   300 4.01 4.05 2.58     1153.8462
## 497   300 4.01 4.03 2.47     1250.0000
## 498   300 4.21 4.24 2.63     1000.0000
## 499   300 4.75 4.61 2.57      882.3529
diamonds2$X=NULL

sqldf("select max(pricepercarat),color
from
diamonds2 group by color")
## Loading required package: tcltk
## Warning: Quoted identifiers should have class SQL, use DBI::SQL() if the
## caller performs the quoting.
##   max(pricepercarat) color
## 1           49519.40     D
## 2           40871.01     E
## 3           37084.06     F
## 4           32998.33     G
## 5           31718.95     H
## 6           24982.45     I
## 7           22890.12     J
## 8           19516.77     K
## 9           14585.41     L