# Intro to R (Week 3 General Track)
# Download data from http://bitly.com/dsdata
# Unzip the file to get csv file
# Read the file in R.
#
# Answer the following questions and submit your R code as well:
# Which color has the maximum price/carat?
# Which clarity has the minimum price/carat?
# Which cut has the most missing values?
# Which color has the minimum median price?
# What conclusion can you draw from the variables color, cut, price and carat?
ls()
## character(0)
rm(list=ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 361790 19.4 592000 31.7 460000 24.6
## Vcells 548493 4.2 1023718 7.9 859315 6.6
memory.size()
## [1] 30.07
memory.limit()
## [1] 8096
getwd()
## [1] "C:/Users/Dell/Dropbox"
setwd("C:/Users/Dell/Desktop")
dir()
## [1] "~$thonajay.docx"
## [2] "BigDiamonds.csv"
## [3] "Class-3-Public-Primary-Certification-Authority.pem.txt"
## [4] "Data Analysis (1)"
## [5] "DataWrangling.pdf"
## [6] "desktop.ini"
## [7] "dump"
## [8] "kushal.jpg"
## [9] "Pythonajay.docx"
## [10] "SUINV.png"
dir(pattern = "csv")
## [1] "BigDiamonds.csv"
library(data.table)
diamonds=fread("BigDiamonds.csv")
##
Read 13.4% of 598024 rows
Read 21.7% of 598024 rows
Read 33.4% of 598024 rows
Read 45.1% of 598024 rows
Read 58.5% of 598024 rows
Read 66.9% of 598024 rows
Read 76.9% of 598024 rows
Read 85.3% of 598024 rows
Read 93.6% of 598024 rows
Read 598024 rows and 13 (of 13) columns from 0.049 GB file in 00:00:13
diamonds$ppc=diamonds$price/diamonds$carat
head(diamonds)
## V1 carat cut color clarity table depth cert measurements price
## 1: 1 0.25 V.Good K I1 59 63.7 GIA 3.96 x 3.95 x 2.52 NA
## 2: 2 0.23 Good G I1 61 58.1 GIA 4.00 x 4.05 x 2.30 NA
## 3: 3 0.34 Good J I2 58 58.7 GIA 4.56 x 4.53 x 2.67 NA
## 4: 4 0.21 V.Good D I1 60 60.6 GIA 3.80 x 3.82 x 2.31 NA
## 5: 5 0.31 V.Good K I1 59 62.2 EGL 4.35 x 4.26 x 2.68 NA
## 6: 6 0.20 Good G SI2 60 64.4 GIA 3.74 x 3.67 x 2.38 NA
## x y z ppc
## 1: 3.96 3.95 2.52 NA
## 2: 4.00 4.05 2.30 NA
## 3: 4.56 4.53 2.67 NA
## 4: 3.80 3.82 2.31 NA
## 5: 4.35 4.26 2.68 NA
## 6: 3.74 3.67 2.38 NA
#598024 --593784
diamonds=na.omit(diamonds)
diamonds[,max(ppc),color]
## color V1
## 1: G 32998.33
## 2: K 19516.77
## 3: J 22890.12
## 4: H 31718.95
## 5: F 37084.06
## 6: I 24982.45
## 7: D 49519.40
## 8: E 40871.01
## 9: L 14585.41
diamonds[,min(ppc),clarity]
## clarity V1
## 1: SI1 735.4167
## 2: SI2 726.4706
## 3: VS2 866.6667
## 4: I1 622.2222
## 5: VVS2 1038.4615
## 6: VS1 872.8814
## 7: VVS1 1139.0244
## 8: I2 525.0000
## 9: IF 1093.7500
diamonds2=fread("BigDiamonds.csv")
##
Read 15.0% of 598024 rows
Read 25.1% of 598024 rows
Read 35.1% of 598024 rows
Read 45.1% of 598024 rows
Read 55.2% of 598024 rows
Read 65.2% of 598024 rows
Read 76.9% of 598024 rows
Read 90.3% of 598024 rows
Read 598024 rows and 13 (of 13) columns from 0.049 GB file in 00:00:11
table(is.na(diamonds2$cut))
##
## FALSE
## 598024
table(is.na(diamonds2$cert))
##
## FALSE
## 598024
diamonds[,median(ppc),color]
## color V1
## 1: G 4314.286
## 2: K 3851.258
## 3: J 4330.348
## 4: H 4551.111
## 5: F 4124.876
## 6: I 4514.706
## 7: D 4114.000
## 8: E 3836.000
## 9: L 2800.000
str(diamonds)
## Classes 'data.table' and 'data.frame': 593784 obs. of 14 variables:
## $ V1 : chr "494" "495" "496" "497" ...
## $ carat : num 0.24 0.31 0.26 0.24 0.3 0.34 0.2 0.29 0.22 0.25 ...
## $ cut : chr "V.Good" "V.Good" "Good" "Ideal" ...
## $ color : chr "G" "K" "J" "G" ...
## $ clarity : chr "SI1" "SI2" "VS2" "SI1" ...
## $ table : num 61 59 56.5 55 57 66 62 58 62 64 ...
## $ depth : num 58.9 60.2 64.1 61.3 62.2 55 59.1 61.4 59.6 60.5 ...
## $ cert : chr "GIA" "GIA" "IGI" "GIA" ...
## $ measurements: chr "4.09 x 4.10 x 2.41" "4.40 x 4.42 x 2.65" "4.01 x 4.05 x 2.58" "4.01 x 4.03 x 2.47" ...
## $ price : int 300 300 300 300 300 300 301 301 301 301 ...
## $ x : num 4.09 4.4 4.01 4.01 4.21 4.75 3.79 4.25 3.9 4.02 ...
## $ y : num 4.1 4.42 4.05 4.03 4.24 4.61 3.82 4.31 3.93 4.06 ...
## $ z : num 2.41 2.65 2.58 2.47 2.63 2.57 2.25 2.63 2.33 2.44 ...
## $ ppc : num 1250 968 1154 1250 1000 ...
## - attr(*, ".internal.selfref")=<externalptr>
attach(diamonds)
table(color,cut)
## cut
## color Good Ideal V.Good
## D 6566 45175 21460
## E 9623 55220 28016
## F 9042 57703 26027
## G 8804 61569 24990
## H 7542 55588 22821
## I 7339 42779 19761
## J 5316 29322 13840
## K 3449 14631 7580
## L 1468 5039 3114
cor(price,carat)
## [1] 0.8563399
#http://stackoverflow.com/questions/31100579/how-to-do-a-crosstab-with-two-categorical-variables-but-populate-it-with-the-mea
with(diamonds, tapply(ppc, list(cut,color), FUN= mean))
## D E F G H I J
## Good 4641.697 4191.966 4280.496 4338.938 4278.112 3863.085 3623.275
## Ideal 7771.666 6468.105 6547.355 6354.171 6352.541 5932.771 5404.822
## V.Good 5655.073 5280.504 5318.004 5400.332 5069.712 4806.902 4488.938
## K L
## Good 3335.944 2814.508
## Ideal 5060.748 3897.145
## V.Good 4196.628 3303.622