ls()
## character(0)
#ls() and top right - environment in rstudio
rm(LTV)
## Warning in rm(LTV): object 'LTV' not found
rm(list=ls())
#install.packages('sqldf') or bottom right Packages - install
#install.packages('sqldf')

workex=c(14,3,19,10,2,3,3.8)
mean(workex)
## [1] 7.828571
median(workex)
## [1] 3.8
education=c("M","B","B","M","M","D","B")
str(workex)
##  num [1:7] 14 3 19 10 2 3 3.8
str(education)
##  chr [1:7] "M" "B" "B" "M" "M" "D" "B"
workex
## [1] 14.0  3.0 19.0 10.0  2.0  3.0  3.8
Rclass=NULL
Rclass$education=education
Rclass$workex=workex
Rclass
## $education
## [1] "M" "B" "B" "M" "M" "D" "B"
## 
## $workex
## [1] 14.0  3.0 19.0 10.0  2.0  3.0  3.8
str(Rclass)
## List of 2
##  $ education: chr [1:7] "M" "B" "B" "M" ...
##  $ workex   : num [1:7] 14 3 19 10 2 3 3.8
workstring=as.character(workex)
workstring
## [1] "14"  "3"   "19"  "10"  "2"   "3"   "3.8"
Rclass=Rclass=as.data.frame(Rclass)
str(Rclass)
## 'data.frame':    7 obs. of  2 variables:
##  $ education: Factor w/ 3 levels "B","D","M": 3 1 1 3 3 2 1
##  $ workex   : num  14 3 19 10 2 3 3.8
summary(Rclass)
##  education     workex      
##  B:3       Min.   : 2.000  
##  D:1       1st Qu.: 3.000  
##  M:3       Median : 3.800  
##            Mean   : 7.829  
##            3rd Qu.:12.000  
##            Max.   :19.000
plot(density(Rclass$workex))

Rclass
##   education workex
## 1         M   14.0
## 2         B    3.0
## 3         B   19.0
## 4         M   10.0
## 5         M    2.0
## 6         D    3.0
## 7         B    3.8
Rclass[4,]
##   education workex
## 4         M     10
Rclass[,2]
## [1] 14.0  3.0 19.0 10.0  2.0  3.0  3.8
Rclass[4,1]
## [1] M
## Levels: B D M
Rclass[4,2]
## [1] 10
Rclass[Rclass$education=='B',2]
## [1]  3.0 19.0  3.8
#mean(Rclass$workex where Rclass$education=='B')
mean(Rclass[Rclass$education=='B',2])
## [1] 8.6
# DATA FROM http://bit.ly/dsdata 

getwd()
## [1] "C:/Users/KOGENTIX/Desktop"
setwd('C:/Users/KOGENTIX/Desktop')
dir(pattern = 'csv')
## [1] "BigDiamonds.csv"     "BigDiamonds.csv.zip"
BigDiamonds=read.csv("BigDiamonds.csv")
rm(BigDiamonds)
library(readr)
BigDiamonds <- read_csv("C:/Users/KOGENTIX/Desktop/BigDiamonds.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_integer(),
##   carat = col_double(),
##   cut = col_character(),
##   color = col_character(),
##   clarity = col_character(),
##   table = col_double(),
##   depth = col_double(),
##   cert = col_character(),
##   measurements = col_character(),
##   price = col_integer(),
##   x = col_double(),
##   y = col_double(),
##   z = col_double()
## )
rm(BigDiamonds)
library(data.table)
BigDiamonds <- fread("C:/Users/KOGENTIX/Desktop/BigDiamonds.csv")
## 
Read 38.5% of 598024 rows
Read 60.2% of 598024 rows
Read 81.9% of 598024 rows
Read 598024 rows and 13 (of 13) columns from 0.049 GB file in 00:00:05
str(BigDiamonds)
## Classes 'data.table' and 'data.frame':   598024 obs. of  13 variables:
##  $ V1          : chr  "1" "2" "3" "4" ...
##  $ carat       : num  0.25 0.23 0.34 0.21 0.31 0.2 0.2 0.22 0.23 0.2 ...
##  $ cut         : chr  "V.Good" "Good" "Good" "V.Good" ...
##  $ color       : chr  "K" "G" "J" "D" ...
##  $ clarity     : chr  "I1" "I1" "I2" "I1" ...
##  $ table       : num  59 61 58 60 59 60 63 61 57.5 65 ...
##  $ depth       : num  63.7 58.1 58.7 60.6 62.2 64.4 62.6 59.2 63.6 54.9 ...
##  $ cert        : chr  "GIA" "GIA" "GIA" "GIA" ...
##  $ measurements: chr  "3.96 x 3.95 x 2.52" "4.00 x 4.05 x 2.30" "4.56 x 4.53 x 2.67" "3.80 x 3.82 x 2.31" ...
##  $ price       : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ x           : num  3.96 4 4.56 3.8 4.35 3.74 3.72 3.95 3.87 3.83 ...
##  $ y           : num  3.95 4.05 4.53 3.82 4.26 3.67 3.65 3.97 3.9 4 ...
##  $ z           : num  2.52 2.3 2.67 2.31 2.68 2.38 2.31 2.34 2.47 2.14 ...
##  - attr(*, ".internal.selfref")=<externalptr>
BigDiamonds$V1=NULL #deleting a column

library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
test1=sqldf('select * from BigDiamonds where cert="GIA" ')

#str and summary first steps for data.frame

summary(BigDiamonds)
##      carat           cut               color             clarity         
##  Min.   :0.200   Length:598024      Length:598024      Length:598024     
##  1st Qu.:0.500   Class :character   Class :character   Class :character  
##  Median :0.900   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1.071                                                           
##  3rd Qu.:1.500                                                           
##  Max.   :9.250                                                           
##                                                                          
##      table           depth           cert           measurements      
##  Min.   : 0.00   Min.   : 0.00   Length:598024      Length:598024     
##  1st Qu.:56.00   1st Qu.:61.00   Class :character   Class :character  
##  Median :58.00   Median :62.10   Mode  :character   Mode  :character  
##  Mean   :57.63   Mean   :61.06                                        
##  3rd Qu.:59.00   3rd Qu.:62.70                                        
##  Max.   :75.90   Max.   :81.30                                        
##                                                                       
##      price             x                y                z         
##  Min.   :  300   Min.   : 0.150   Min.   : 1.000   Min.   : 0.040  
##  1st Qu.: 1220   1st Qu.: 4.740   1st Qu.: 4.970   1st Qu.: 3.120  
##  Median : 3503   Median : 5.780   Median : 6.050   Median : 3.860  
##  Mean   : 8753   Mean   : 5.991   Mean   : 6.199   Mean   : 4.033  
##  3rd Qu.:11174   3rd Qu.: 6.970   3rd Qu.: 7.230   3rd Qu.: 4.610  
##  Max.   :99990   Max.   :13.890   Max.   :13.890   Max.   :13.180  
##  NA's   :713     NA's   :1815     NA's   :1852     NA's   :2544
mean(BigDiamonds$price,na.rm=T)# ignore missing
## [1] 8753.018
price2=na.omit(BigDiamonds$price) #omit missing
mean(price2)
## [1] 8753.018
BigDiamonds$price[1:100]
##   [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
##  [24] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
##  [47] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
##  [70] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
##  [93] NA NA NA NA NA NA NA NA
is.na(BigDiamonds$price[1:100])
##   [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [15] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [29] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [43] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [57] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [71] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [85] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [99] TRUE TRUE
table(is.na(BigDiamonds$price))
## 
##  FALSE   TRUE 
## 597311    713
table(BigDiamonds$cert)
## 
##        AGS        EGL  EGL Intl. EGL ISRAEL    EGL USA        GIA 
##       2958      33814      11447      11301      16079     463555 
##        HRD        IGI      OTHER 
##       9936      43667       5267
table(BigDiamonds$cut)
## 
##   Good  Ideal V.Good 
##  59680 369448 168896
table(BigDiamonds$color)
## 
##     D     E     F     G     H     I     J     K     L 
## 73630 93483 93573 96204 86619 70282 48709 25868  9656
#FIND OUT 

#BigDiamonds[i,j,k] i selection j operation k groupby
BigDiamonds[price>3000, ,]
##         carat    cut color clarity table depth cert
##      1:  0.70 V.Good     G     VS2  60.0  62.2  GIA
##      2:  0.61  Ideal     F      IF  58.0  62.4  GIA
##      3:  0.85  Ideal     J     VS2  58.0  61.3  GIA
##      4:  0.61  Ideal     G      IF  57.0  60.8  GIA
##      5:  0.70   Good     E    VVS2  59.0  63.2  GIA
##     ---                                            
## 320820:  3.02  Ideal     E    VVS2  58.0  59.8  HRD
## 320821:  5.01 V.Good     I    VVS2  63.5  61.5  IGI
## 320822:  3.43  Ideal     F     VS2  54.0  62.7  GIA
## 320823:  3.01 V.Good     E     VS1  58.0  62.9  GIA
## 320824:  4.13  Ideal     H      IF  56.0  62.5  IGI
##                     measurements price     x     y    z
##      1:       5.70 x 5.68 x 3.54  3001  5.70  5.68 3.54
##      2:       5.42 x 5.38 x 3.37  3001  5.42  5.38 3.37
##      3:       6.08 x 6.06 x 3.72  3001  6.08  6.06 3.72
##      4:       5.50 x 5.53 x 3.35  3001  5.50  5.53 3.35
##      5:       5.54 x 5.56 x 3.51  3001  5.54  5.56 3.51
##     ---                                                
## 320820:       9.43 x 9.51 x 5.66 99930  9.43  9.51 5.66
## 320821: 10.78  x  10.89  x  6.68 99942 10.78 10.89 6.68
## 320822:       9.66 x 9.61 x 6.05 99960  9.66  9.61 6.05
## 320823:       9.15 x 9.19 x 5.77 99966  9.15  9.19 5.77
## 320824:      10.27 x 10.19 x 6.4 99990 10.27 10.19 6.40
BigDiamonds[price>3000,mean(price,na.rm = T),]
## [1] 15144.52
BigDiamonds[price>3000,mean(price,na.rm = T),cut ]
##       cut       V1
## 1: V.Good 13473.27
## 2:  Ideal 16287.45
## 3:   Good 10933.93
BigDiamonds[price>3000,mean(price,na.rm = T),color ]
##    color       V1
## 1:     G 15206.97
## 2:     F 15234.45
## 3:     J 14572.55
## 4:     E 14784.08
## 5:     I 14644.23
## 6:     H 15619.51
## 7:     D 16126.97
## 8:     K 15015.71
## 9:     L 12961.05
BigDiamonds[price>3000,mean(price,na.rm = T),.(color,cut) ]
##     color    cut        V1
##  1:     G V.Good 13912.689
##  2:     F  Ideal 16560.190
##  3:     J  Ideal 15106.234
##  4:     G  Ideal 16127.682
##  5:     E   Good  9562.663
##  6:     I   Good 10568.739
##  7:     I V.Good 13373.244
##  8:     E V.Good 12512.359
##  9:     H V.Good 13546.145
## 10:     G   Good 11217.941
## 11:     F V.Good 13153.974
## 12:     E  Ideal 16549.568
## 13:     I  Ideal 15601.083
## 14:     H  Ideal 16766.347
## 15:     D  Ideal 17991.784
## 16:     D V.Good 13063.586
## 17:     J   Good 10946.462
## 18:     K  Ideal 15217.389
## 19:     F   Good 11114.473
## 20:     H   Good 11490.488
## 21:     K V.Good 15361.029
## 22:     J V.Good 14305.686
## 23:     D   Good 10355.956
## 24:     L  Ideal 12607.748
## 25:     L V.Good 13453.077
## 26:     K   Good 12485.608
## 27:     L   Good 13843.234
##     color    cut        V1
BigDiamonds[price>3000 & carat>2,mean(price,na.rm = T),color ]
##    color       V1
## 1:     G 33954.66
## 2:     E 38434.23
## 3:     H 31931.87
## 4:     J 28098.96
## 5:     D 39269.05
## 6:     I 28453.21
## 7:     K 24667.76
## 8:     F 36323.07
## 9:     L 22561.79
#FIND OUT  CHEAPEST DIAMOND BELONGS TO WHICH COLOR
BigDiamonds[ ,mean(price,na.rm = T),color ]
##    color       V1
## 1:     K 9694.257
## 2:     G 8984.200
## 3:     J 9423.581
## 4:     D 8266.346
## 5:     F 8234.730
## 6:     E 7282.990
## 7:     H 9941.795
## 8:     I 9541.319
## 9:     L 7109.228
BigDiamonds[ ,mean(price,na.rm = T),.(color,cut) ]
##     color    cut        V1
##  1:     K V.Good  8865.398
##  2:     G   Good  5661.293
##  3:     J   Good  5609.094
##  4:     D V.Good  6443.311
##  5:     F   Good  5070.773
##  6:     F V.Good  7003.693
##  7:     G V.Good  8075.610
##  8:     J V.Good  8394.493
##  9:     E   Good  4185.086
## 10:     D   Good  4656.791
## 11:     E  Ideal  8341.317
## 12:     E V.Good  6259.823
## 13:     H V.Good  8114.545
## 14:     F  Ideal  9285.867
## 15:     H   Good  6152.773
## 16:     I   Good  5540.319
## 17:     K   Good  5752.414
## 18:     I V.Good  8031.530
## 19:     L   Good  5156.013
## 20:     I  Ideal 10923.938
## 21:     G  Ideal  9826.261
## 22:     D  Ideal  9654.869
## 23:     J  Ideal 10599.596
## 24:     H  Ideal 11203.491
## 25:     L V.Good  6542.255
## 26:     K  Ideal 11050.691
## 27:     L  Ideal  8027.597
##     color    cut        V1
ans2=BigDiamonds[ ,mean(price,na.rm = T),.(color,cut) ]
ans2
##     color    cut        V1
##  1:     K V.Good  8865.398
##  2:     G   Good  5661.293
##  3:     J   Good  5609.094
##  4:     D V.Good  6443.311
##  5:     F   Good  5070.773
##  6:     F V.Good  7003.693
##  7:     G V.Good  8075.610
##  8:     J V.Good  8394.493
##  9:     E   Good  4185.086
## 10:     D   Good  4656.791
## 11:     E  Ideal  8341.317
## 12:     E V.Good  6259.823
## 13:     H V.Good  8114.545
## 14:     F  Ideal  9285.867
## 15:     H   Good  6152.773
## 16:     I   Good  5540.319
## 17:     K   Good  5752.414
## 18:     I V.Good  8031.530
## 19:     L   Good  5156.013
## 20:     I  Ideal 10923.938
## 21:     G  Ideal  9826.261
## 22:     D  Ideal  9654.869
## 23:     J  Ideal 10599.596
## 24:     H  Ideal 11203.491
## 25:     L V.Good  6542.255
## 26:     K  Ideal 11050.691
## 27:     L  Ideal  8027.597
##     color    cut        V1
ans2[ans2$V1==min(ans2$V1),]
##    color  cut       V1
## 1:     E Good 4185.086
#FIND OUT  CHEAPEST DIAMOND BELONGS TO WHICH COLOR AND CUT COMBINATION