# Intro to R  (Week 3 General Track)
# Download data from http://bitly.com/dsdata
# Unzip the file to get csv file
# Read the file in R.
# 
# Answer the following questions and also submit your R code:
# Which color has the maximum price/carat?
# Which clarity has the minimum price/carat?
# Which cut has more missing values?
# Which color has the minimum median price?
# What conclusion can you draw from the variables color, cut, price and carat?


# Session housekeeping: list and clear the workspace, then report memory use.
ls()
## character(0)
rm(list = ls())
gc()
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 361790 19.4     592000 31.7   460000 24.6
## Vcells 548493  4.2    1023718  7.9   859315  6.6
# NOTE(review): memory.size()/memory.limit() are Windows-specific helpers --
# confirm they are still wanted before running this elsewhere.
memory.size()
## [1] 30.07
memory.limit()
## [1] 8096
# Move from the starting directory to the folder that holds the unzipped CSV.
# NOTE(review): the path below is specific to the author's machine.
getwd()
## [1] "C:/Users/Dell/Dropbox"
setwd("C:/Users/Dell/Desktop")
# Show everything in the folder, then only the entries whose name matches "csv".
list.files()
##  [1] "~$thonajay.docx"                                       
##  [2] "BigDiamonds.csv"                                       
##  [3] "Class-3-Public-Primary-Certification-Authority.pem.txt"
##  [4] "Data Analysis (1)"                                     
##  [5] "DataWrangling.pdf"                                     
##  [6] "desktop.ini"                                           
##  [7] "dump"                                                  
##  [8] "kushal.jpg"                                            
##  [9] "Pythonajay.docx"                                       
## [10] "SUINV.png"
list.files(pattern = "csv")
## [1] "BigDiamonds.csv"
# Load the diamonds data with data.table's fast CSV reader.
library(data.table)
diamonds <- fread("BigDiamonds.csv")
## 
## Read 13.4% of 598024 rows
## Read 21.7% of 598024 rows
## Read 33.4% of 598024 rows
## Read 45.1% of 598024 rows
## Read 58.5% of 598024 rows
## Read 66.9% of 598024 rows
## Read 76.9% of 598024 rows
## Read 85.3% of 598024 rows
## Read 93.6% of 598024 rows
## Read 598024 rows and 13 (of 13) columns from 0.049 GB file in 00:00:13
# (The progress lines above are recorded console output; they are commented
# out so that the script can be parsed and sourced.)

# Price per carat, the quantity the first two questions ask about.
# `:=` adds the column to the existing table by reference.
diamonds[, ppc := price / carat]
# Preview the first six rows; in the rows shown, price is NA and so is ppc.
head(diamonds, n = 6L)
##    V1 carat    cut color clarity table depth cert       measurements price
## 1:  1  0.25 V.Good     K      I1    59  63.7  GIA 3.96 x 3.95 x 2.52    NA
## 2:  2  0.23   Good     G      I1    61  58.1  GIA 4.00 x 4.05 x 2.30    NA
## 3:  3  0.34   Good     J      I2    58  58.7  GIA 4.56 x 4.53 x 2.67    NA
## 4:  4  0.21 V.Good     D      I1    60  60.6  GIA 3.80 x 3.82 x 2.31    NA
## 5:  5  0.31 V.Good     K      I1    59  62.2  EGL 4.35 x 4.26 x 2.68    NA
## 6:  6  0.20   Good     G     SI2    60  64.4  GIA 3.74 x 3.67 x 2.38    NA
##       x    y    z ppc
## 1: 3.96 3.95 2.52  NA
## 2: 4.00 4.05 2.30  NA
## 3: 4.56 4.53 2.67  NA
## 4: 3.80 3.82 2.31  NA
## 5: 4.35 4.26 2.68  NA
## 6: 3.74 3.67 2.38  NA
# Drop every row that contains an NA: 598024 rows before, 593784 after.
diamonds <- na.omit(diamonds)

# Q1: largest price per carat within each color.
# The overall maximum in the output below (49519.40) belongs to color D.
diamonds[, max(ppc), by = color]
##    color       V1
## 1:     G 32998.33
## 2:     K 19516.77
## 3:     J 22890.12
## 4:     H 31718.95
## 5:     F 37084.06
## 6:     I 24982.45
## 7:     D 49519.40
## 8:     E 40871.01
## 9:     L 14585.41
# Q2: smallest price per carat within each clarity.
# The overall minimum in the output below (525) belongs to clarity I2.
diamonds[, min(ppc), by = clarity]
##    clarity        V1
## 1:     SI1  735.4167
## 2:     SI2  726.4706
## 3:     VS2  866.6667
## 4:      I1  622.2222
## 5:    VVS2 1038.4615
## 6:     VS1  872.8814
## 7:    VVS1 1139.0244
## 8:      I2  525.0000
## 9:      IF 1093.7500
# Q3: which cut has more missing values?
# Re-read the raw file, because `diamonds` has already had its incomplete
# rows removed by na.omit().
diamonds2 <- fread("BigDiamonds.csv")
## 
## Read 15.0% of 598024 rows
## Read 25.1% of 598024 rows
## Read 35.1% of 598024 rows
## Read 45.1% of 598024 rows
## Read 55.2% of 598024 rows
## Read 65.2% of 598024 rows
## Read 76.9% of 598024 rows
## Read 90.3% of 598024 rows
## Read 598024 rows and 13 (of 13) columns from 0.049 GB file in 00:00:11
# (Progress lines above are recorded console output, commented out so the
# script can be parsed.)

# Neither `cut` nor `cert` is itself ever NA ...
table(is.na(diamonds2$cut))
## 
##  FALSE 
## 598024
table(is.na(diamonds2$cert))
## 
##  FALSE 
## 598024
# ... so the question has to be answered by counting the NA cells in the other
# columns within each cut (.SD holds every column except the grouping one).
# The first row of the result is the cut with the most missing values.
# (Output for this statement was not captured in the original transcript.)
diamonds2[, .(missing_values = sum(is.na(.SD))), by = cut][order(-missing_values)]
# Median price PER CARAT for each color; the lowest value below (2800) is
# color L.
diamonds[, median(ppc), by = color]
##    color       V1
## 1:     G 4314.286
## 2:     K 3851.258
## 3:     J 4330.348
## 4:     H 4551.111
## 5:     F 4124.876
## 6:     I 4514.706
## 7:     D 4114.000
## 8:     E 3836.000
## 9:     L 2800.000
# Q4 as worded asks for the minimum median *price*, not price per carat, so
# compute that as well; the first row after sorting is the answer.
# (Output for this statement was not captured in the original transcript.)
diamonds[, .(median_price = median(price)), by = color][order(median_price)]
# Structure of the cleaned table: 593784 rows, 14 columns.
str(diamonds)
## Classes 'data.table' and 'data.frame':   593784 obs. of  14 variables:
##  $ V1          : chr  "494" "495" "496" "497" ...
##  $ carat       : num  0.24 0.31 0.26 0.24 0.3 0.34 0.2 0.29 0.22 0.25 ...
##  $ cut         : chr  "V.Good" "V.Good" "Good" "Ideal" ...
##  $ color       : chr  "G" "K" "J" "G" ...
##  $ clarity     : chr  "SI1" "SI2" "VS2" "SI1" ...
##  $ table       : num  61 59 56.5 55 57 66 62 58 62 64 ...
##  $ depth       : num  58.9 60.2 64.1 61.3 62.2 55 59.1 61.4 59.6 60.5 ...
##  $ cert        : chr  "GIA" "GIA" "IGI" "GIA" ...
##  $ measurements: chr  "4.09 x 4.10 x 2.41" "4.40 x 4.42 x 2.65" "4.01 x 4.05 x 2.58" "4.01 x 4.03 x 2.47" ...
##  $ price       : int  300 300 300 300 300 300 301 301 301 301 ...
##  $ x           : num  4.09 4.4 4.01 4.01 4.21 4.75 3.79 4.25 3.9 4.02 ...
##  $ y           : num  4.1 4.42 4.05 4.03 4.24 4.61 3.82 4.31 3.93 4.06 ...
##  $ z           : num  2.41 2.65 2.58 2.47 2.63 2.57 2.25 2.63 2.33 2.44 ...
##  $ ppc         : num  1250 968 1154 1250 1000 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Q5: evaluate inside the table with with() rather than attach(), so that the
# columns named `cut` and `table` are not left on the search path where they
# would mask the base functions of the same name.
# Number of diamonds for every color x cut combination.
with(diamonds, table(color, cut))
##      cut
## color  Good Ideal V.Good
##     D  6566 45175  21460
##     E  9623 55220  28016
##     F  9042 57703  26027
##     G  8804 61569  24990
##     H  7542 55588  22821
##     I  7339 42779  19761
##     J  5316 29322  13840
##     K  3449 14631   7580
##     L  1468  5039   3114
# Price and carat are strongly positively correlated (about 0.86).
with(diamonds, cor(price, carat))
## [1] 0.8563399
# Cross-tab of mean price per carat for every cut (rows) x color (columns).
# Approach taken from:
#http://stackoverflow.com/questions/31100579/how-to-do-a-crosstab-with-two-categorical-variables-but-populate-it-with-the-mea
# Reading the output below: Ideal has the highest mean in every color column,
# and the means generally fall from color D toward color L.
tapply(diamonds$ppc, list(diamonds$cut, diamonds$color), mean)
##               D        E        F        G        H        I        J
## Good   4641.697 4191.966 4280.496 4338.938 4278.112 3863.085 3623.275
## Ideal  7771.666 6468.105 6547.355 6354.171 6352.541 5932.771 5404.822
## V.Good 5655.073 5280.504 5318.004 5400.332 5069.712 4806.902 4488.938
##               K        L
## Good   3335.944 2814.508
## Ideal  5060.748 3897.145
## V.Good 4196.628 3303.622