# Empty the current workspace and run the garbage collector so memory held by
# earlier objects is released before the large CSV files are loaded.
# NOTE(review): rm(list = ls()) only removes objects; attached packages and
# options() are left as they were, so this is not a substitute for a fresh
# R session.
rm(list = ls())
gc()
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 358830 19.2     592000 31.7   460000 24.6
## Vcells 549748  4.2    1023718  7.9   843187  6.5
# Show the working directory; the CSV file names passed to fread() further
# down are relative to it.
getwd()
## [1] "/home/ajayohri/Desktop/test"
# List the files available in that directory.
list.files()
##  [1] "arules.html"                        
##  [2] "arules.R"                           
##  [3] "arules.spin.R"                      
##  [4] "arules.spin.Rmd"                    
##  [5] "BigDiamonds.csv"                    
##  [6] "BigDiamonds.csv.zip"                
##  [7] "ccFraud.csv"                        
##  [8] "data manipulation and dviz.R"       
##  [9] "data_manipulation_and_dviz.R"       
## [10] "data_manipulation_and_dviz.spin.R"  
## [11] "data_manipulation_and_dviz.spin.Rmd"
## [12] "decisiontrees.R"                    
## [13] "fraudrisk.R"                        
## [14] "test.Rproj"
library(data.table)
library(dplyr)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the diamonds data set from the working directory with data.table's
# fast reader.
diamonds <- fread("BigDiamonds.csv")
## 
## Read 65.2% of 598024 rows
## Read 598024 rows and 13 (of 13) columns from 0.049 GB file in 00:00:03
# Load the credit-card fraud data set from the working directory.
fraud <- fread("ccFraud.csv")
## 
## Read 0.0% of 10000000 rows
## Read 27.2% of 10000000 rows
## Read 54.2% of 10000000 rows
## Read 81.1% of 10000000 rows
## Read 10000000 rows and 9 (of 9) columns from 0.272 GB file in 00:00:07
# Keep only the five columns used in the rest of the diamonds analysis, then
# print a per-column summary of the reduced table.
diamonds2 <- diamonds %>%
  select(cut, clarity, color, price, carat)
summary(diamonds2)
##      cut              clarity             color               price      
##  Length:598024      Length:598024      Length:598024      Min.   :  300  
##  Class :character   Class :character   Class :character   1st Qu.: 1220  
##  Mode  :character   Mode  :character   Mode  :character   Median : 3503  
##                                                           Mean   : 8753  
##                                                           3rd Qu.:11174  
##                                                           Max.   :99990  
##                                                           NA's   :713    
##      carat      
##  Min.   :0.200  
##  1st Qu.:0.500  
##  Median :0.900  
##  Mean   :1.071  
##  3rd Qu.:1.500  
##  Max.   :9.250  
## 
# Frequency of each cut grade.
table(diamonds2[["cut"]])
## 
##   Good  Ideal V.Good 
##  59680 369448 168896
# Frequency of each clarity grade.
table(diamonds2[["clarity"]])
## 
##     I1     I2     IF    SI1    SI2    VS1    VS2   VVS1   VVS2 
##  14524   2302  31157 116631 104300  97730 111082  54798  65500
# Frequency of each color grade.
table(diamonds2[["color"]])
## 
##     D     E     F     G     H     I     J     K     L 
## 73630 93483 93573 96204 86619 70282 48709 25868  9656
# Drop rows containing missing values (the summary above reports NA's in
# price), so the price-per-carat figures computed later are never NA.
diamonds2 <- na.omit(diamonds2)

# The full 13-column table is not used again; remove it and run the garbage
# collector so its memory can be reclaimed.
rm(diamonds)
gc()
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells   434287  23.2    1731918  92.5  1337800  71.5
## Vcells 48786968 372.3   79761136 608.6 66070896 504.1
# Derive price per carat so stones of different weights can be compared,
# then preview the first rows.
diamonds2 <- diamonds2 %>%
  mutate(pricepercarat = price / carat)
head(diamonds2)
##      cut clarity color price carat pricepercarat
## 1 V.Good     SI1     G   300  0.24     1250.0000
## 2 V.Good     SI2     K   300  0.31      967.7419
## 3   Good     VS2     J   300  0.26     1153.8462
## 4  Ideal     SI1     G   300  0.24     1250.0000
## 5   Good      I1     H   300  0.30     1000.0000
## 6   Good      I1     F   300  0.34      882.3529
# Rebuild diamonds2 as a data.table so the DT[i, j, by] aggregations below can
# be used, then list the data.tables currently held in memory.
diamonds2 <- data.table(diamonds2)
tables()
##      NAME            NROW NCOL  MB
## [1,] diamonds2    597,311    6  26
## [2,] fraud     10,000,000    9 344
##      COLS                                                                            
## [1,] cut,clarity,color,price,carat,pricepercarat                                     
## [2,] custID,gender,state,cardholder,balance,numTrans,numIntlTrans,creditLine,fraudRis
##      KEY
## [1,]    
## [2,]    
## Total: 370MB
# Highest price per carat within each cut grade.
diamonds2[, max(pricepercarat), by = .(cut)]
##       cut       V1
## 1: V.Good 49519.40
## 2:   Good 43410.00
## 3:  Ideal 49481.59
# Lowest price per carat within each cut grade.
diamonds2[, min(pricepercarat), by = .(cut)]
##       cut       V1
## 1: V.Good 604.7809
## 2:   Good 525.0000
## 3:  Ideal 671.1111
# Highest price per carat within each clarity grade.
diamonds2[, max(pricepercarat), by = .(clarity)]
##    clarity       V1
## 1:     SI1 26606.31
## 2:     SI2 20859.43
## 3:     VS2 33196.01
## 4:      I1 11048.43
## 5:    VVS2 38581.30
## 6:     VS1 35737.04
## 7:    VVS1 42412.86
## 8:      I2 10500.00
## 9:      IF 49519.40
# Lowest price per carat within each clarity grade.
diamonds2[, min(pricepercarat), by = .(clarity)]
##    clarity        V1
## 1:     SI1  735.4167
## 2:     SI2  726.4706
## 3:     VS2  866.6667
## 4:      I1  622.2222
## 5:    VVS2 1038.4615
## 6:     VS1  872.8814
## 7:    VVS1 1139.0244
## 8:      I2  525.0000
## 9:      IF 1093.7500
# Highest price per carat within each color grade.
diamonds2[, max(pricepercarat), by = .(color)]
##    color       V1
## 1:     G 32998.33
## 2:     K 19516.77
## 3:     J 22890.12
## 4:     H 31718.95
## 5:     F 37084.06
## 6:     I 24982.45
## 7:     D 49519.40
## 8:     E 40871.01
## 9:     L 14585.41
# Lowest price per carat within each color grade.
diamonds2[, min(pricepercarat), by = .(color)]
##    color       V1
## 1:     G 654.2373
## 2:     K 637.6812
## 3:     J 630.6122
## 4:     H 660.8696
## 5:     F 671.1111
## 6:     I 525.0000
## 7:     D 603.8136
## 8:     E 604.7809
## 9:     L 683.3333
# Highest price per carat for every cut/color combination.
diamonds2[, max(pricepercarat), by = .(cut, color)]
##        cut color       V1
##  1: V.Good     G 32670.72
##  2: V.Good     K 19516.77
##  3:   Good     J 19881.67
##  4:  Ideal     G 32998.33
##  5:   Good     H 24412.53
##  6:   Good     F 33099.67
##  7: V.Good     I 22702.73
##  8:  Ideal     I 24982.45
##  9: V.Good     D 49519.40
## 10: V.Good     H 28426.67
## 11:   Good     I 19340.26
## 12: V.Good     E 33876.97
## 13: V.Good     F 34267.00
## 14:  Ideal     H 31718.95
## 15:   Good     E 33196.01
## 16:   Good     G 29395.68
## 17:  Ideal     F 37084.06
## 18:  Ideal     K 18615.38
## 19: V.Good     J 19616.40
## 20:  Ideal     E 40871.01
## 21:   Good     D 43410.00
## 22:   Good     K 18257.46
## 23:  Ideal     L 14280.08
## 24: V.Good     L 14585.41
## 25:  Ideal     J 22890.12
## 26:  Ideal     D 49481.59
## 27:   Good     L 13622.38
##        cut color       V1
# Highest price per carat for every cut/color/clarity combination.
diamonds2[, max(pricepercarat), by = .(cut, color, clarity)]
##         cut color clarity        V1
##   1: V.Good     G     SI1 19240.232
##   2: V.Good     K     SI2 13977.305
##   3:   Good     J     VS2 19881.673
##   4:  Ideal     G     SI1 22861.098
##   5:   Good     H      I1  6324.176
##  ---                               
## 239:  Ideal     L      I2  1491.111
## 240: V.Good     L      I2  4645.133
## 241:  Ideal     K      I2  3339.109
## 242:   Good     E      IF 31500.498
## 243:   Good     D      IF 43410.000
# Store the per-combination maxima (data.table names the unnamed result V1)
# and print them ordered from the lowest to the highest maximum.
diamonds3 <- diamonds2[, max(pricepercarat), by = .(cut, color, clarity)]
diamonds3 %>%
  arrange(V1)
##        cut color clarity        V1
## 1    Ideal     L      I2  1491.111
## 2     Good     L      I2  2978.500
## 3     Good     K      I2  3060.648
## 4    Ideal     K      I2  3339.109
## 5     Good     I      I2  3633.333
## 6     Good     L      I1  4018.627
## 7    Ideal     I      I2  4082.446
## 8   V.Good     J      I2  4100.794
## 9    Ideal     J      I2  4100.800
## 10  V.Good     K      I2  4173.740
## 11   Ideal     K      I1  4411.765
## 12    Good     J      I2  4430.932
## 13  V.Good     L      I1  4490.698
## 14  V.Good     L      I2  4645.133
## 15  V.Good     E      I2  5086.923
## 16    Good     K      I1  5184.106
## 17    Good     J      I1  5235.714
## 18    Good     D      I2  5291.905
## 19  V.Good     G      I2  5424.422
## 20  V.Good     I      I2  5460.000
## 21    Good     G      I2  5564.865
## 22    Good     F      I2  5774.797
## 23   Ideal     G      I2  5775.000
## 24  V.Good     F      I2  5775.238
## 25   Ideal     D      I2  5805.333
## 26   Ideal     H      I2  5829.462
## 27    Good     E      I2  5953.731
## 28   Ideal     E      I2  5953.731
## 29  V.Good     D      I2  6021.154
## 30   Ideal     J      I1  6105.413
## 31    Good     H      I1  6324.176
## 32   Ideal     L      I1  6343.750
## 33    Good     L      IF  6423.841
## 34  V.Good     G      I1  6459.375
## 35    Good     D      I1  6633.333
## 36    Good     G      I1  6735.099
## 37  V.Good     E      I1  6736.123
## 38    Good     E      I1  6767.401
## 39    Good     H      I2  6825.000
## 40    Good     F      I1  6853.745
## 41  V.Good     K      I1  6996.705
## 42   Ideal     E      I1  6997.519
## 43    Good     I      I1  7108.000
## 44  V.Good     H      I2  7280.000
## 45   Ideal     D      I1  7471.429
## 46    Good     L     SI2  7487.624
## 47  V.Good     D      I1  7555.024
## 48   Ideal     I      I1  8370.118
## 49  V.Good     H      I1  8716.749
## 50  V.Good     J      I1  9084.063
## 51   Ideal     L     SI2  9172.865
## 52   Ideal     H      I1  9197.342
## 53  V.Good     F      I1  9661.244
## 54   Ideal     G      I1  9692.118
## 55  V.Good     L     SI2  9742.326
## 56    Good     L     SI1  9960.000
## 57   Ideal     F      I2 10500.000
## 58  V.Good     L    VVS1 10559.780
## 59    Good     L    VVS2 10857.143
## 60    Good     K     SI1 10912.903
## 61   Ideal     F      I1 11035.632
## 62  V.Good     I      I1 11048.426
## 63    Good     I      IF 11287.500
## 64    Good     L    VVS1 11440.000
## 65    Good     L     VS2 11532.062
## 66  V.Good     L    VVS2 11543.478
## 67   Ideal     L     VS2 12143.713
## 68   Ideal     K     SI2 12360.396
## 69    Good     K     SI2 12360.624
## 70   Ideal     L    VVS1 12652.174
## 71  V.Good     L     VS1 12905.410
## 72   Ideal     L      IF 12905.637
## 73  V.Good     L     SI1 13146.933
## 74  V.Good     L     VS2 13284.937
## 75    Good     J    VVS1 13396.358
## 76    Good     D     SI1 13402.985
## 77    Good     K     VS1 13469.781
## 78   Ideal     L     SI1 13609.943
## 79    Good     L     VS1 13622.376
## 80    Good     J     SI2 13974.495
## 81  V.Good     K     SI2 13977.305
## 82   Ideal     L    VVS2 14007.045
## 83    Good     G     SI2 14008.306
## 84   Ideal     L     VS1 14280.080
## 85    Good     K     VS2 14339.254
## 86  V.Good     J     SI2 14423.575
## 87  V.Good     K     SI1 14423.689
## 88  V.Good     L      IF 14585.406
## 89    Good     J      IF 14708.491
## 90    Good     D     SI2 14941.079
## 91    Good     K    VVS2 14977.199
## 92    Good     E     SI2 15185.185
## 93  V.Good     K    VVS2 15351.117
## 94  V.Good     I     SI2 15454.890
## 95   Ideal     K     SI1 15479.967
## 96    Good     K    VVS1 15540.066
## 97  V.Good     J     SI1 15994.800
## 98   Ideal     K     VS2 16072.013
## 99    Good     J     SI1 16084.418
## 100 V.Good     E     SI2 16235.409
## 101  Ideal     J     SI2 16335.727
## 102  Ideal     K     VS1 16356.088
## 103   Good     H     SI2 16539.930
## 104 V.Good     D     SI2 16583.665
## 105   Good     E     SI1 16873.754
## 106   Good     I     VS1 17061.224
## 107 V.Good     F     SI2 17306.600
## 108   Good     H     SI1 17443.363
## 109  Ideal     K      IF 17463.636
## 110 V.Good     K    VVS1 17471.906
## 111 V.Good     J    VVS1 17537.698
## 112  Ideal     J     SI1 17646.071
## 113 V.Good     K     VS2 17698.390
## 114  Ideal     K    VVS2 17760.784
## 115 V.Good     K     VS1 17895.409
## 116   Good     I     SI2 17918.327
## 117 V.Good     J    VVS2 18029.700
## 118  Ideal     I     SI2 18081.532
## 119   Good     G     SI1 18165.000
## 120   Good     F     SI2 18257.171
## 121   Good     K      IF 18257.463
## 122 V.Good     J     VS1 18346.608
## 123  Ideal     E     SI2 18430.492
## 124   Good     I     SI1 18502.390
## 125  Ideal     D     SI2 18565.232
## 126  Ideal     G     SI2 18608.268
## 127  Ideal     K    VVS1 18615.385
## 128   Good     I     VS2 18756.000
## 129   Good     F     SI1 18875.000
## 130   Good     J     VS1 19058.708
## 131 V.Good     J     VS2 19231.507
## 132 V.Good     G     SI1 19240.232
## 133  Ideal     F     SI2 19240.828
## 134   Good     I    VVS2 19265.781
## 135 V.Good     G     SI2 19319.493
## 136   Good     I    VVS1 19340.264
## 137   Good     J    VVS2 19509.780
## 138 V.Good     K      IF 19516.766
## 139  Ideal     J      IF 19599.263
## 140 V.Good     J      IF 19616.400
## 141  Ideal     H     SI2 19623.810
## 142  Ideal     J     VS2 19726.190
## 143 V.Good     I     SI1 19788.224
## 144 V.Good     I     VS2 19800.398
## 145  Ideal     I     SI1 19842.914
## 146   Good     J     VS2 19881.673
## 147 V.Good     I    VVS1 20155.388
## 148   Good     H    VVS2 20370.333
## 149  Ideal     J     VS1 20410.256
## 150  Ideal     J    VVS2 20433.741
## 151 V.Good     H     SI1 20758.411
## 152 V.Good     H     SI2 20859.427
## 153 V.Good     I     VS1 20955.860
## 154 V.Good     I      IF 21032.419
## 155   Good     H     VS2 21197.304
## 156  Ideal     H     SI1 21226.402
## 157 V.Good     D     SI1 21350.831
## 158  Ideal     I     VS2 21424.946
## 159 V.Good     I    VVS2 22702.730
## 160  Ideal     I     VS1 22837.349
## 161  Ideal     G     SI1 22861.098
## 162  Ideal     J    VVS1 22890.123
## 163   Good     H      IF 23114.901
## 164 V.Good     E     SI1 23143.216
## 165   Good     F     VS1 23219.900
## 166   Good     H    VVS1 23246.667
## 167   Good     G      IF 23459.801
## 168 V.Good     F     SI1 23615.062
## 169  Ideal     I    VVS2 23906.716
## 170  Ideal     F     SI1 24342.537
## 171   Good     H     VS1 24412.531
## 172  Ideal     I    VVS1 24795.184
## 173  Ideal     I      IF 24982.450
## 174 V.Good     H     VS1 25557.895
## 175  Ideal     E     SI1 25714.098
## 176   Good     G     VS2 25725.000
## 177 V.Good     G     VS2 25844.503
## 178 V.Good     H     VS2 25946.023
## 179 V.Good     H    VVS1 26154.485
## 180  Ideal     D     SI1 26606.312
## 181   Good     G     VS1 26760.133
## 182 V.Good     H      IF 26943.333
## 183  Ideal     H     VS1 27252.778
## 184   Good     E     VS2 27259.136
## 185 V.Good     F    VVS1 27542.289
## 186  Ideal     G     VS2 27696.348
## 187   Good     D     VS1 27926.910
## 188   Good     D    VVS1 27929.851
## 189  Ideal     H     VS2 28167.391
## 190   Good     E    VVS2 28262.458
## 191   Good     G    VVS1 28317.881
## 192 V.Good     H    VVS2 28426.667
## 193  Ideal     H    VVS2 29391.429
## 194   Good     D    VVS2 29393.377
## 195   Good     G    VVS2 29395.681
## 196   Good     F     VS2 29416.667
## 197 V.Good     G    VVS1 30343.333
## 198   Good     F    VVS1 30772.757
## 199 V.Good     G     VS1 30887.417
## 200   Good     F      IF 30985.854
## 201 V.Good     F     VS2 31003.279
## 202 V.Good     D    VVS1 31310.000
## 203  Ideal     H      IF 31320.792
## 204   Good     E      IF 31500.498
## 205  Ideal     F     VS2 31566.013
## 206   Good     E    VVS1 31646.358
## 207  Ideal     H    VVS1 31718.954
## 208  Ideal     G    VVS2 31798.671
## 209 V.Good     G      IF 31904.967
## 210 V.Good     F    VVS2 32136.667
## 211 V.Good     D    VVS2 32263.184
## 212 V.Good     D     VS2 32268.212
## 213   Good     D     VS2 32280.399
## 214  Ideal     G     VS1 32299.003
## 215 V.Good     G    VVS2 32670.724
## 216 V.Good     F     VS1 32670.861
## 217 V.Good     D     VS1 32744.262
## 218  Ideal     G      IF 32843.234
## 219 V.Good     E     VS2 32867.881
## 220  Ideal     F     VS1 32980.399
## 221  Ideal     G    VVS1 32998.333
## 222  Ideal     E     VS1 33016.611
## 223  Ideal     E     VS2 33036.213
## 224   Good     F    VVS2 33099.668
## 225 V.Good     E    VVS2 33122.924
## 226   Good     E     VS1 33196.013
## 227  Ideal     D     VS2 33196.013
## 228 V.Good     E     VS1 33211.296
## 229  Ideal     E    VVS2 33402.917
## 230  Ideal     F    VVS2 33408.244
## 231  Ideal     F    VVS1 33702.198
## 232 V.Good     E    VVS1 33800.000
## 233 V.Good     E      IF 33876.974
## 234 V.Good     F      IF 34266.995
## 235  Ideal     D     VS1 35737.037
## 236  Ideal     F      IF 37084.058
## 237  Ideal     D    VVS2 38581.301
## 238  Ideal     E    VVS1 38938.000
## 239  Ideal     E      IF 40871.014
## 240  Ideal     D    VVS1 42412.857
## 241   Good     D      IF 43410.000
## 242  Ideal     D      IF 49481.592
## 243 V.Good     D      IF 49519.403
# Column names of the fraud table.
names(fraud)
## [1] "custID"       "gender"       "state"        "cardholder"  
## [5] "balance"      "numTrans"     "numIntlTrans" "creditLine"  
## [9] "fraudRisk"
# Per-column summary statistics for the fraud table.
summary(fraud)
##      custID            gender          state         cardholder  
##  Min.   :1.0e+00   Min.   :1.000   Min.   : 1.00   Min.   :1.00  
##  1st Qu.:2.5e+06   1st Qu.:1.000   1st Qu.:10.00   1st Qu.:1.00  
##  Median :5.0e+06   Median :1.000   Median :24.00   Median :1.00  
##  Mean   :5.0e+06   Mean   :1.382   Mean   :24.66   Mean   :1.03  
##  3rd Qu.:7.5e+06   3rd Qu.:2.000   3rd Qu.:38.00   3rd Qu.:1.00  
##  Max.   :1.0e+07   Max.   :2.000   Max.   :51.00   Max.   :2.00  
##     balance         numTrans       numIntlTrans      creditLine    
##  Min.   :    0   Min.   :  0.00   Min.   : 0.000   Min.   : 1.000  
##  1st Qu.:    0   1st Qu.: 10.00   1st Qu.: 0.000   1st Qu.: 4.000  
##  Median : 3706   Median : 19.00   Median : 0.000   Median : 6.000  
##  Mean   : 4110   Mean   : 28.94   Mean   : 4.047   Mean   : 9.134  
##  3rd Qu.: 6000   3rd Qu.: 39.00   3rd Qu.: 4.000   3rd Qu.:11.000  
##  Max.   :41485   Max.   :100.00   Max.   :60.000   Max.   :75.000  
##    fraudRisk     
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.0596  
##  3rd Qu.:0.0000  
##  Max.   :1.0000
# Exploratory plots of the fraud data.
#
# Columns are resolved with with(fraud, ...) / `data = fraud` instead of
# attach(fraud): attach() places a copy of the table on the search path, was
# never detach()ed here, and lets same-named objects in the workspace silently
# shadow the columns. with() evaluates one expression against the table and
# leaves the search path untouched, while the bare column names still provide
# the default axis labels and titles.

# Class balance of the target and distribution of two coded predictors.
with(fraud, barplot(table(fraudRisk)))

with(fraud, barplot(table(gender)))

with(fraud, barplot(table(state)))

# Distribution of account balance, overall and split by gender.
with(fraud, boxplot(balance))

with(fraud, hist(balance))

boxplot(balance ~ gender, data = fraud)

# NOTE(review): the summary of fraud suggests fraudRisk and gender are
# two-level codes (0/1 and 1/2), so this scatter plot draws all ten million
# points on a handful of positions -- confirm whether a cross-tabulation such
# as table(fraudRisk, gender) was intended.
with(fraud, plot(fraudRisk, gender))