ls()
## character(0)
#ls() and top right - environment in rstudio
rm(LTV)
## Warning in rm(LTV): object 'LTV' not found
rm(list=ls())
#install.packages('sqldf') or bottom right Packages - install
#install.packages('sqldf')
workex=c(14,3,19,10,2,3,3.8)
mean(workex)
## [1] 7.828571
median(workex)
## [1] 3.8
education=c("M","B","B","M","M","D","B")
str(workex)
## num [1:7] 14 3 19 10 2 3 3.8
str(education)
## chr [1:7] "M" "B" "B" "M" "M" "D" "B"
workex
## [1] 14.0 3.0 19.0 10.0 2.0 3.0 3.8
Rclass=NULL
Rclass$education=education
Rclass$workex=workex
Rclass
## $education
## [1] "M" "B" "B" "M" "M" "D" "B"
##
## $workex
## [1] 14.0 3.0 19.0 10.0 2.0 3.0 3.8
str(Rclass)
## List of 2
## $ education: chr [1:7] "M" "B" "B" "M" ...
## $ workex : num [1:7] 14 3 19 10 2 3 3.8
workstring=as.character(workex)
workstring
## [1] "14" "3" "19" "10" "2" "3" "3.8"
Rclass=Rclass=as.data.frame(Rclass)
str(Rclass)
## 'data.frame': 7 obs. of 2 variables:
## $ education: Factor w/ 3 levels "B","D","M": 3 1 1 3 3 2 1
## $ workex : num 14 3 19 10 2 3 3.8
summary(Rclass)
## education workex
## B:3 Min. : 2.000
## D:1 1st Qu.: 3.000
## M:3 Median : 3.800
## Mean : 7.829
## 3rd Qu.:12.000
## Max. :19.000
plot(density(Rclass$workex))

Rclass
## education workex
## 1 M 14.0
## 2 B 3.0
## 3 B 19.0
## 4 M 10.0
## 5 M 2.0
## 6 D 3.0
## 7 B 3.8
Rclass[4,]
## education workex
## 4 M 10
Rclass[,2]
## [1] 14.0 3.0 19.0 10.0 2.0 3.0 3.8
Rclass[4,1]
## [1] M
## Levels: B D M
Rclass[4,2]
## [1] 10
Rclass[Rclass$education=='B',2]
## [1] 3.0 19.0 3.8
#mean(Rclass$workex where Rclass$education=='B')
mean(Rclass[Rclass$education=='B',2])
## [1] 8.6
# DATA FROM http://bit.ly/dsdata
getwd()
## [1] "C:/Users/KOGENTIX/Desktop"
setwd('C:/Users/KOGENTIX/Desktop')
dir(pattern = 'csv')
## [1] "BigDiamonds.csv" "BigDiamonds.csv.zip"
BigDiamonds=read.csv("BigDiamonds.csv")
rm(BigDiamonds)
library(readr)
BigDiamonds <- read_csv("C:/Users/KOGENTIX/Desktop/BigDiamonds.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_integer(),
## carat = col_double(),
## cut = col_character(),
## color = col_character(),
## clarity = col_character(),
## table = col_double(),
## depth = col_double(),
## cert = col_character(),
## measurements = col_character(),
## price = col_integer(),
## x = col_double(),
## y = col_double(),
## z = col_double()
## )
rm(BigDiamonds)
library(data.table)
BigDiamonds <- fread("C:/Users/KOGENTIX/Desktop/BigDiamonds.csv")
##
Read 38.5% of 598024 rows
Read 60.2% of 598024 rows
Read 81.9% of 598024 rows
Read 598024 rows and 13 (of 13) columns from 0.049 GB file in 00:00:05
str(BigDiamonds)
## Classes 'data.table' and 'data.frame': 598024 obs. of 13 variables:
## $ V1 : chr "1" "2" "3" "4" ...
## $ carat : num 0.25 0.23 0.34 0.21 0.31 0.2 0.2 0.22 0.23 0.2 ...
## $ cut : chr "V.Good" "Good" "Good" "V.Good" ...
## $ color : chr "K" "G" "J" "D" ...
## $ clarity : chr "I1" "I1" "I2" "I1" ...
## $ table : num 59 61 58 60 59 60 63 61 57.5 65 ...
## $ depth : num 63.7 58.1 58.7 60.6 62.2 64.4 62.6 59.2 63.6 54.9 ...
## $ cert : chr "GIA" "GIA" "GIA" "GIA" ...
## $ measurements: chr "3.96 x 3.95 x 2.52" "4.00 x 4.05 x 2.30" "4.56 x 4.53 x 2.67" "3.80 x 3.82 x 2.31" ...
## $ price : int NA NA NA NA NA NA NA NA NA NA ...
## $ x : num 3.96 4 4.56 3.8 4.35 3.74 3.72 3.95 3.87 3.83 ...
## $ y : num 3.95 4.05 4.53 3.82 4.26 3.67 3.65 3.97 3.9 4 ...
## $ z : num 2.52 2.3 2.67 2.31 2.68 2.38 2.31 2.34 2.47 2.14 ...
## - attr(*, ".internal.selfref")=<externalptr>
BigDiamonds$V1=NULL #deleting a column
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
test1=sqldf('select * from BigDiamonds where cert="GIA" ')
#str and summary first steps for data.frame
summary(BigDiamonds)
## carat cut color clarity
## Min. :0.200 Length:598024 Length:598024 Length:598024
## 1st Qu.:0.500 Class :character Class :character Class :character
## Median :0.900 Mode :character Mode :character Mode :character
## Mean :1.071
## 3rd Qu.:1.500
## Max. :9.250
##
## table depth cert measurements
## Min. : 0.00 Min. : 0.00 Length:598024 Length:598024
## 1st Qu.:56.00 1st Qu.:61.00 Class :character Class :character
## Median :58.00 Median :62.10 Mode :character Mode :character
## Mean :57.63 Mean :61.06
## 3rd Qu.:59.00 3rd Qu.:62.70
## Max. :75.90 Max. :81.30
##
## price x y z
## Min. : 300 Min. : 0.150 Min. : 1.000 Min. : 0.040
## 1st Qu.: 1220 1st Qu.: 4.740 1st Qu.: 4.970 1st Qu.: 3.120
## Median : 3503 Median : 5.780 Median : 6.050 Median : 3.860
## Mean : 8753 Mean : 5.991 Mean : 6.199 Mean : 4.033
## 3rd Qu.:11174 3rd Qu.: 6.970 3rd Qu.: 7.230 3rd Qu.: 4.610
## Max. :99990 Max. :13.890 Max. :13.890 Max. :13.180
## NA's :713 NA's :1815 NA's :1852 NA's :2544
mean(BigDiamonds$price,na.rm=T)# ignore missing
## [1] 8753.018
price2=na.omit(BigDiamonds$price) #omit missing
mean(price2)
## [1] 8753.018
BigDiamonds$price[1:100]
## [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [24] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [47] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [70] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [93] NA NA NA NA NA NA NA NA
is.na(BigDiamonds$price[1:100])
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [15] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [29] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [43] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [57] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [71] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [85] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [99] TRUE TRUE
table(is.na(BigDiamonds$price))
##
## FALSE TRUE
## 597311 713
table(BigDiamonds$cert)
##
## AGS EGL EGL Intl. EGL ISRAEL EGL USA GIA
## 2958 33814 11447 11301 16079 463555
## HRD IGI OTHER
## 9936 43667 5267
table(BigDiamonds$cut)
##
## Good Ideal V.Good
## 59680 369448 168896
table(BigDiamonds$color)
##
## D E F G H I J K L
## 73630 93483 93573 96204 86619 70282 48709 25868 9656
#FIND OUT
#BigDiamonds[i,j,k] i selection j operation k groupby
BigDiamonds[price>3000, ,]
## carat cut color clarity table depth cert
## 1: 0.70 V.Good G VS2 60.0 62.2 GIA
## 2: 0.61 Ideal F IF 58.0 62.4 GIA
## 3: 0.85 Ideal J VS2 58.0 61.3 GIA
## 4: 0.61 Ideal G IF 57.0 60.8 GIA
## 5: 0.70 Good E VVS2 59.0 63.2 GIA
## ---
## 320820: 3.02 Ideal E VVS2 58.0 59.8 HRD
## 320821: 5.01 V.Good I VVS2 63.5 61.5 IGI
## 320822: 3.43 Ideal F VS2 54.0 62.7 GIA
## 320823: 3.01 V.Good E VS1 58.0 62.9 GIA
## 320824: 4.13 Ideal H IF 56.0 62.5 IGI
## measurements price x y z
## 1: 5.70 x 5.68 x 3.54 3001 5.70 5.68 3.54
## 2: 5.42 x 5.38 x 3.37 3001 5.42 5.38 3.37
## 3: 6.08 x 6.06 x 3.72 3001 6.08 6.06 3.72
## 4: 5.50 x 5.53 x 3.35 3001 5.50 5.53 3.35
## 5: 5.54 x 5.56 x 3.51 3001 5.54 5.56 3.51
## ---
## 320820: 9.43 x 9.51 x 5.66 99930 9.43 9.51 5.66
## 320821: 10.78 x 10.89 x 6.68 99942 10.78 10.89 6.68
## 320822: 9.66 x 9.61 x 6.05 99960 9.66 9.61 6.05
## 320823: 9.15 x 9.19 x 5.77 99966 9.15 9.19 5.77
## 320824: 10.27 x 10.19 x 6.4 99990 10.27 10.19 6.40
BigDiamonds[price>3000,mean(price,na.rm = T),]
## [1] 15144.52
BigDiamonds[price>3000,mean(price,na.rm = T),cut ]
## cut V1
## 1: V.Good 13473.27
## 2: Ideal 16287.45
## 3: Good 10933.93
BigDiamonds[price>3000,mean(price,na.rm = T),color ]
## color V1
## 1: G 15206.97
## 2: F 15234.45
## 3: J 14572.55
## 4: E 14784.08
## 5: I 14644.23
## 6: H 15619.51
## 7: D 16126.97
## 8: K 15015.71
## 9: L 12961.05
BigDiamonds[price>3000,mean(price,na.rm = T),.(color,cut) ]
## color cut V1
## 1: G V.Good 13912.689
## 2: F Ideal 16560.190
## 3: J Ideal 15106.234
## 4: G Ideal 16127.682
## 5: E Good 9562.663
## 6: I Good 10568.739
## 7: I V.Good 13373.244
## 8: E V.Good 12512.359
## 9: H V.Good 13546.145
## 10: G Good 11217.941
## 11: F V.Good 13153.974
## 12: E Ideal 16549.568
## 13: I Ideal 15601.083
## 14: H Ideal 16766.347
## 15: D Ideal 17991.784
## 16: D V.Good 13063.586
## 17: J Good 10946.462
## 18: K Ideal 15217.389
## 19: F Good 11114.473
## 20: H Good 11490.488
## 21: K V.Good 15361.029
## 22: J V.Good 14305.686
## 23: D Good 10355.956
## 24: L Ideal 12607.748
## 25: L V.Good 13453.077
## 26: K Good 12485.608
## 27: L Good 13843.234
## color cut V1
BigDiamonds[price>3000 & carat>2,mean(price,na.rm = T),color ]
## color V1
## 1: G 33954.66
## 2: E 38434.23
## 3: H 31931.87
## 4: J 28098.96
## 5: D 39269.05
## 6: I 28453.21
## 7: K 24667.76
## 8: F 36323.07
## 9: L 22561.79
#FIND OUT CHEAPEST DIAMOND BELONGS TO WHICH COLOR
BigDiamonds[ ,mean(price,na.rm = T),color ]
## color V1
## 1: K 9694.257
## 2: G 8984.200
## 3: J 9423.581
## 4: D 8266.346
## 5: F 8234.730
## 6: E 7282.990
## 7: H 9941.795
## 8: I 9541.319
## 9: L 7109.228
BigDiamonds[ ,mean(price,na.rm = T),.(color,cut) ]
## color cut V1
## 1: K V.Good 8865.398
## 2: G Good 5661.293
## 3: J Good 5609.094
## 4: D V.Good 6443.311
## 5: F Good 5070.773
## 6: F V.Good 7003.693
## 7: G V.Good 8075.610
## 8: J V.Good 8394.493
## 9: E Good 4185.086
## 10: D Good 4656.791
## 11: E Ideal 8341.317
## 12: E V.Good 6259.823
## 13: H V.Good 8114.545
## 14: F Ideal 9285.867
## 15: H Good 6152.773
## 16: I Good 5540.319
## 17: K Good 5752.414
## 18: I V.Good 8031.530
## 19: L Good 5156.013
## 20: I Ideal 10923.938
## 21: G Ideal 9826.261
## 22: D Ideal 9654.869
## 23: J Ideal 10599.596
## 24: H Ideal 11203.491
## 25: L V.Good 6542.255
## 26: K Ideal 11050.691
## 27: L Ideal 8027.597
## color cut V1
ans2=BigDiamonds[ ,mean(price,na.rm = T),.(color,cut) ]
ans2
## color cut V1
## 1: K V.Good 8865.398
## 2: G Good 5661.293
## 3: J Good 5609.094
## 4: D V.Good 6443.311
## 5: F Good 5070.773
## 6: F V.Good 7003.693
## 7: G V.Good 8075.610
## 8: J V.Good 8394.493
## 9: E Good 4185.086
## 10: D Good 4656.791
## 11: E Ideal 8341.317
## 12: E V.Good 6259.823
## 13: H V.Good 8114.545
## 14: F Ideal 9285.867
## 15: H Good 6152.773
## 16: I Good 5540.319
## 17: K Good 5752.414
## 18: I V.Good 8031.530
## 19: L Good 5156.013
## 20: I Ideal 10923.938
## 21: G Ideal 9826.261
## 22: D Ideal 9654.869
## 23: J Ideal 10599.596
## 24: H Ideal 11203.491
## 25: L V.Good 6542.255
## 26: K Ideal 11050.691
## 27: L Ideal 8027.597
## color cut V1
ans2[ans2$V1==min(ans2$V1),]
## color cut V1
## 1: E Good 4185.086
#FIND OUT CHEAPEST DIAMOND BELONGS TO WHICH COLOR AND CUT COMBINATION