# Start from a clean slate: drop every non-hidden object in the workspace,
# then run the garbage collector, which also prints a memory usage report.
rm(list = ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 358830 19.2 592000 31.7 460000 24.6
## Vcells 549748 4.2 1023718 7.9 843187 6.5
# Show the working directory and what it contains; the CSV files loaded
# further down are referenced by bare file name, so they must live here.
getwd()
## [1] "/home/ajayohri/Desktop/test"
list.files()
## [1] "arules.html"
## [2] "arules.R"
## [3] "arules.spin.R"
## [4] "arules.spin.Rmd"
## [5] "BigDiamonds.csv"
## [6] "BigDiamonds.csv.zip"
## [7] "ccFraud.csv"
## [8] "data manipulation and dviz.R"
## [9] "data_manipulation_and_dviz.R"
## [10] "data_manipulation_and_dviz.spin.R"
## [11] "data_manipulation_and_dviz.spin.Rmd"
## [12] "decisiontrees.R"
## [13] "fraudrisk.R"
## [14] "test.Rproj"
library(data.table)
library(dplyr)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the two CSV data sets from the working directory with data.table's
# fread(). The progress messages fread printed while reading are kept below
# as "## " output comments, like every other piece of console output in this
# script, so that the file still parses when it is sourced.
diamonds <- fread("BigDiamonds.csv")
##
## Read 65.2% of 598024 rows
## Read 598024 rows and 13 (of 13) columns from 0.049 GB file in 00:00:03
fraud <- fread("ccFraud.csv")
##
## Read 0.0% of 10000000 rows
## Read 27.2% of 10000000 rows
## Read 54.2% of 10000000 rows
## Read 81.1% of 10000000 rows
## Read 10000000 rows and 9 (of 9) columns from 0.272 GB file in 00:00:07
# Keep only the five columns used in the rest of the analysis and print a
# per-column summary of the reduced table.
diamonds2 <- diamonds %>%
  select(cut, clarity, color, price, carat)
summary(diamonds2)
## cut clarity color price
## Length:598024 Length:598024 Length:598024 Min. : 300
## Class :character Class :character Class :character 1st Qu.: 1220
## Mode :character Mode :character Mode :character Median : 3503
## Mean : 8753
## 3rd Qu.:11174
## Max. :99990
## NA's :713
## carat
## Min. :0.200
## 1st Qu.:0.500
## Median :0.900
## Mean :1.071
## 3rd Qu.:1.500
## Max. :9.250
##
# Frequency counts for each of the three categorical columns.
table(diamonds2[["cut"]])
##
## Good Ideal V.Good
## 59680 369448 168896
table(diamonds2[["clarity"]])
##
## I1 I2 IF SI1 SI2 VS1 VS2 VVS1 VVS2
## 14524 2302 31157 116631 104300 97730 111082 54798 65500
table(diamonds2[["color"]])
##
## D E F G H I J K L
## 73630 93483 93573 96204 86619 70282 48709 25868 9656
# Discard every row that contains a missing value, then remove the original
# full-width table and run the garbage collector to reclaim its memory.
diamonds2 <- na.omit(diamonds2)
rm(list = "diamonds")
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 434287 23.2 1731918 92.5 1337800 71.5
## Vcells 48786968 372.3 79761136 608.6 66070896 504.1
# Derive the price of each stone per carat and preview the first rows.
diamonds2 <- diamonds2 %>%
  mutate(pricepercarat = price / carat)
head(diamonds2)
## cut clarity color price carat pricepercarat
## 1 V.Good SI1 G 300 0.24 1250.0000
## 2 V.Good SI2 K 300 0.31 967.7419
## 3 Good VS2 J 300 0.26 1153.8462
## 4 Ideal SI1 G 300 0.24 1250.0000
## 5 Good I1 H 300 0.30 1000.0000
## 6 Good I1 F 300 0.34 882.3529
# Make sure diamonds2 is a data.table so the DT[i, j, by] queries below work,
# then list every data.table currently held in memory.
diamonds2 <- as.data.table(diamonds2)
tables()
## NAME NROW NCOL MB
## [1,] diamonds2 597,311 6 26
## [2,] fraud 10,000,000 9 344
## COLS
## [1,] cut,clarity,color,price,carat,pricepercarat
## [2,] custID,gender,state,cardholder,balance,numTrans,numIntlTrans,creditLine,fraudRis
## KEY
## [1,]
## [2,]
## Total: 370MB
# Highest price per carat within each cut grade.
diamonds2[, max(pricepercarat), by = .(cut)]
## cut V1
## 1: V.Good 49519.40
## 2: Good 43410.00
## 3: Ideal 49481.59
# Lowest price per carat within each cut grade.
diamonds2[, min(pricepercarat), by = .(cut)]
## cut V1
## 1: V.Good 604.7809
## 2: Good 525.0000
## 3: Ideal 671.1111
# Highest price per carat within each clarity grade.
diamonds2[, max(pricepercarat), by = .(clarity)]
## clarity V1
## 1: SI1 26606.31
## 2: SI2 20859.43
## 3: VS2 33196.01
## 4: I1 11048.43
## 5: VVS2 38581.30
## 6: VS1 35737.04
## 7: VVS1 42412.86
## 8: I2 10500.00
## 9: IF 49519.40
# Lowest price per carat within each clarity grade.
diamonds2[, min(pricepercarat), by = .(clarity)]
## clarity V1
## 1: SI1 735.4167
## 2: SI2 726.4706
## 3: VS2 866.6667
## 4: I1 622.2222
## 5: VVS2 1038.4615
## 6: VS1 872.8814
## 7: VVS1 1139.0244
## 8: I2 525.0000
## 9: IF 1093.7500
# Highest price per carat within each color grade.
diamonds2[, max(pricepercarat), by = .(color)]
## color V1
## 1: G 32998.33
## 2: K 19516.77
## 3: J 22890.12
## 4: H 31718.95
## 5: F 37084.06
## 6: I 24982.45
## 7: D 49519.40
## 8: E 40871.01
## 9: L 14585.41
# Lowest price per carat within each color grade.
diamonds2[, min(pricepercarat), by = .(color)]
## color V1
## 1: G 654.2373
## 2: K 637.6812
## 3: J 630.6122
## 4: H 660.8696
## 5: F 671.1111
## 6: I 525.0000
## 7: D 603.8136
## 8: E 604.7809
## 9: L 683.3333
# Highest price per carat for every combination of cut and color.
diamonds2[, max(pricepercarat), by = .(cut, color)]
## cut color V1
## 1: V.Good G 32670.72
## 2: V.Good K 19516.77
## 3: Good J 19881.67
## 4: Ideal G 32998.33
## 5: Good H 24412.53
## 6: Good F 33099.67
## 7: V.Good I 22702.73
## 8: Ideal I 24982.45
## 9: V.Good D 49519.40
## 10: V.Good H 28426.67
## 11: Good I 19340.26
## 12: V.Good E 33876.97
## 13: V.Good F 34267.00
## 14: Ideal H 31718.95
## 15: Good E 33196.01
## 16: Good G 29395.68
## 17: Ideal F 37084.06
## 18: Ideal K 18615.38
## 19: V.Good J 19616.40
## 20: Ideal E 40871.01
## 21: Good D 43410.00
## 22: Good K 18257.46
## 23: Ideal L 14280.08
## 24: V.Good L 14585.41
## 25: Ideal J 22890.12
## 26: Ideal D 49481.59
## 27: Good L 13622.38
## cut color V1
# Highest price per carat for every cut / color / clarity combination.
# The grouped aggregate is computed once and stored; the stored result is
# then printed and sorted, instead of running the same group-by twice.
diamonds3 <- diamonds2[, max(pricepercarat), by = .(cut, color, clarity)]
diamonds3
## cut color clarity V1
## 1: V.Good G SI1 19240.232
## 2: V.Good K SI2 13977.305
## 3: Good J VS2 19881.673
## 4: Ideal G SI1 22861.098
## 5: Good H I1 6324.176
## ---
## 239: Ideal L I2 1491.111
## 240: V.Good L I2 4645.133
## 241: Ideal K I2 3339.109
## 242: Good E IF 31500.498
## 243: Good D IF 43410.000
# List the combinations in ascending order of their maximum price per carat.
# V1 is the name data.table assigns to the unnamed aggregate column.
arrange(diamonds3, V1)
## cut color clarity V1
## 1 Ideal L I2 1491.111
## 2 Good L I2 2978.500
## 3 Good K I2 3060.648
## 4 Ideal K I2 3339.109
## 5 Good I I2 3633.333
## 6 Good L I1 4018.627
## 7 Ideal I I2 4082.446
## 8 V.Good J I2 4100.794
## 9 Ideal J I2 4100.800
## 10 V.Good K I2 4173.740
## 11 Ideal K I1 4411.765
## 12 Good J I2 4430.932
## 13 V.Good L I1 4490.698
## 14 V.Good L I2 4645.133
## 15 V.Good E I2 5086.923
## 16 Good K I1 5184.106
## 17 Good J I1 5235.714
## 18 Good D I2 5291.905
## 19 V.Good G I2 5424.422
## 20 V.Good I I2 5460.000
## 21 Good G I2 5564.865
## 22 Good F I2 5774.797
## 23 Ideal G I2 5775.000
## 24 V.Good F I2 5775.238
## 25 Ideal D I2 5805.333
## 26 Ideal H I2 5829.462
## 27 Good E I2 5953.731
## 28 Ideal E I2 5953.731
## 29 V.Good D I2 6021.154
## 30 Ideal J I1 6105.413
## 31 Good H I1 6324.176
## 32 Ideal L I1 6343.750
## 33 Good L IF 6423.841
## 34 V.Good G I1 6459.375
## 35 Good D I1 6633.333
## 36 Good G I1 6735.099
## 37 V.Good E I1 6736.123
## 38 Good E I1 6767.401
## 39 Good H I2 6825.000
## 40 Good F I1 6853.745
## 41 V.Good K I1 6996.705
## 42 Ideal E I1 6997.519
## 43 Good I I1 7108.000
## 44 V.Good H I2 7280.000
## 45 Ideal D I1 7471.429
## 46 Good L SI2 7487.624
## 47 V.Good D I1 7555.024
## 48 Ideal I I1 8370.118
## 49 V.Good H I1 8716.749
## 50 V.Good J I1 9084.063
## 51 Ideal L SI2 9172.865
## 52 Ideal H I1 9197.342
## 53 V.Good F I1 9661.244
## 54 Ideal G I1 9692.118
## 55 V.Good L SI2 9742.326
## 56 Good L SI1 9960.000
## 57 Ideal F I2 10500.000
## 58 V.Good L VVS1 10559.780
## 59 Good L VVS2 10857.143
## 60 Good K SI1 10912.903
## 61 Ideal F I1 11035.632
## 62 V.Good I I1 11048.426
## 63 Good I IF 11287.500
## 64 Good L VVS1 11440.000
## 65 Good L VS2 11532.062
## 66 V.Good L VVS2 11543.478
## 67 Ideal L VS2 12143.713
## 68 Ideal K SI2 12360.396
## 69 Good K SI2 12360.624
## 70 Ideal L VVS1 12652.174
## 71 V.Good L VS1 12905.410
## 72 Ideal L IF 12905.637
## 73 V.Good L SI1 13146.933
## 74 V.Good L VS2 13284.937
## 75 Good J VVS1 13396.358
## 76 Good D SI1 13402.985
## 77 Good K VS1 13469.781
## 78 Ideal L SI1 13609.943
## 79 Good L VS1 13622.376
## 80 Good J SI2 13974.495
## 81 V.Good K SI2 13977.305
## 82 Ideal L VVS2 14007.045
## 83 Good G SI2 14008.306
## 84 Ideal L VS1 14280.080
## 85 Good K VS2 14339.254
## 86 V.Good J SI2 14423.575
## 87 V.Good K SI1 14423.689
## 88 V.Good L IF 14585.406
## 89 Good J IF 14708.491
## 90 Good D SI2 14941.079
## 91 Good K VVS2 14977.199
## 92 Good E SI2 15185.185
## 93 V.Good K VVS2 15351.117
## 94 V.Good I SI2 15454.890
## 95 Ideal K SI1 15479.967
## 96 Good K VVS1 15540.066
## 97 V.Good J SI1 15994.800
## 98 Ideal K VS2 16072.013
## 99 Good J SI1 16084.418
## 100 V.Good E SI2 16235.409
## 101 Ideal J SI2 16335.727
## 102 Ideal K VS1 16356.088
## 103 Good H SI2 16539.930
## 104 V.Good D SI2 16583.665
## 105 Good E SI1 16873.754
## 106 Good I VS1 17061.224
## 107 V.Good F SI2 17306.600
## 108 Good H SI1 17443.363
## 109 Ideal K IF 17463.636
## 110 V.Good K VVS1 17471.906
## 111 V.Good J VVS1 17537.698
## 112 Ideal J SI1 17646.071
## 113 V.Good K VS2 17698.390
## 114 Ideal K VVS2 17760.784
## 115 V.Good K VS1 17895.409
## 116 Good I SI2 17918.327
## 117 V.Good J VVS2 18029.700
## 118 Ideal I SI2 18081.532
## 119 Good G SI1 18165.000
## 120 Good F SI2 18257.171
## 121 Good K IF 18257.463
## 122 V.Good J VS1 18346.608
## 123 Ideal E SI2 18430.492
## 124 Good I SI1 18502.390
## 125 Ideal D SI2 18565.232
## 126 Ideal G SI2 18608.268
## 127 Ideal K VVS1 18615.385
## 128 Good I VS2 18756.000
## 129 Good F SI1 18875.000
## 130 Good J VS1 19058.708
## 131 V.Good J VS2 19231.507
## 132 V.Good G SI1 19240.232
## 133 Ideal F SI2 19240.828
## 134 Good I VVS2 19265.781
## 135 V.Good G SI2 19319.493
## 136 Good I VVS1 19340.264
## 137 Good J VVS2 19509.780
## 138 V.Good K IF 19516.766
## 139 Ideal J IF 19599.263
## 140 V.Good J IF 19616.400
## 141 Ideal H SI2 19623.810
## 142 Ideal J VS2 19726.190
## 143 V.Good I SI1 19788.224
## 144 V.Good I VS2 19800.398
## 145 Ideal I SI1 19842.914
## 146 Good J VS2 19881.673
## 147 V.Good I VVS1 20155.388
## 148 Good H VVS2 20370.333
## 149 Ideal J VS1 20410.256
## 150 Ideal J VVS2 20433.741
## 151 V.Good H SI1 20758.411
## 152 V.Good H SI2 20859.427
## 153 V.Good I VS1 20955.860
## 154 V.Good I IF 21032.419
## 155 Good H VS2 21197.304
## 156 Ideal H SI1 21226.402
## 157 V.Good D SI1 21350.831
## 158 Ideal I VS2 21424.946
## 159 V.Good I VVS2 22702.730
## 160 Ideal I VS1 22837.349
## 161 Ideal G SI1 22861.098
## 162 Ideal J VVS1 22890.123
## 163 Good H IF 23114.901
## 164 V.Good E SI1 23143.216
## 165 Good F VS1 23219.900
## 166 Good H VVS1 23246.667
## 167 Good G IF 23459.801
## 168 V.Good F SI1 23615.062
## 169 Ideal I VVS2 23906.716
## 170 Ideal F SI1 24342.537
## 171 Good H VS1 24412.531
## 172 Ideal I VVS1 24795.184
## 173 Ideal I IF 24982.450
## 174 V.Good H VS1 25557.895
## 175 Ideal E SI1 25714.098
## 176 Good G VS2 25725.000
## 177 V.Good G VS2 25844.503
## 178 V.Good H VS2 25946.023
## 179 V.Good H VVS1 26154.485
## 180 Ideal D SI1 26606.312
## 181 Good G VS1 26760.133
## 182 V.Good H IF 26943.333
## 183 Ideal H VS1 27252.778
## 184 Good E VS2 27259.136
## 185 V.Good F VVS1 27542.289
## 186 Ideal G VS2 27696.348
## 187 Good D VS1 27926.910
## 188 Good D VVS1 27929.851
## 189 Ideal H VS2 28167.391
## 190 Good E VVS2 28262.458
## 191 Good G VVS1 28317.881
## 192 V.Good H VVS2 28426.667
## 193 Ideal H VVS2 29391.429
## 194 Good D VVS2 29393.377
## 195 Good G VVS2 29395.681
## 196 Good F VS2 29416.667
## 197 V.Good G VVS1 30343.333
## 198 Good F VVS1 30772.757
## 199 V.Good G VS1 30887.417
## 200 Good F IF 30985.854
## 201 V.Good F VS2 31003.279
## 202 V.Good D VVS1 31310.000
## 203 Ideal H IF 31320.792
## 204 Good E IF 31500.498
## 205 Ideal F VS2 31566.013
## 206 Good E VVS1 31646.358
## 207 Ideal H VVS1 31718.954
## 208 Ideal G VVS2 31798.671
## 209 V.Good G IF 31904.967
## 210 V.Good F VVS2 32136.667
## 211 V.Good D VVS2 32263.184
## 212 V.Good D VS2 32268.212
## 213 Good D VS2 32280.399
## 214 Ideal G VS1 32299.003
## 215 V.Good G VVS2 32670.724
## 216 V.Good F VS1 32670.861
## 217 V.Good D VS1 32744.262
## 218 Ideal G IF 32843.234
## 219 V.Good E VS2 32867.881
## 220 Ideal F VS1 32980.399
## 221 Ideal G VVS1 32998.333
## 222 Ideal E VS1 33016.611
## 223 Ideal E VS2 33036.213
## 224 Good F VVS2 33099.668
## 225 V.Good E VVS2 33122.924
## 226 Good E VS1 33196.013
## 227 Ideal D VS2 33196.013
## 228 V.Good E VS1 33211.296
## 229 Ideal E VVS2 33402.917
## 230 Ideal F VVS2 33408.244
## 231 Ideal F VVS1 33702.198
## 232 V.Good E VVS1 33800.000
## 233 V.Good E IF 33876.974
## 234 V.Good F IF 34266.995
## 235 Ideal D VS1 35737.037
## 236 Ideal F IF 37084.058
## 237 Ideal D VVS2 38581.301
## 238 Ideal E VVS1 38938.000
## 239 Ideal E IF 40871.014
## 240 Ideal D VVS1 42412.857
## 241 Good D IF 43410.000
## 242 Ideal D IF 49481.592
## 243 V.Good D IF 49519.403
# Column names of the credit-card fraud data set, followed by a per-column
# numeric summary.
colnames(fraud)
## [1] "custID" "gender" "state" "cardholder"
## [5] "balance" "numTrans" "numIntlTrans" "creditLine"
## [9] "fraudRisk"
summary(fraud)
## custID gender state cardholder
## Min. :1.0e+00 Min. :1.000 Min. : 1.00 Min. :1.00
## 1st Qu.:2.5e+06 1st Qu.:1.000 1st Qu.:10.00 1st Qu.:1.00
## Median :5.0e+06 Median :1.000 Median :24.00 Median :1.00
## Mean :5.0e+06 Mean :1.382 Mean :24.66 Mean :1.03
## 3rd Qu.:7.5e+06 3rd Qu.:2.000 3rd Qu.:38.00 3rd Qu.:1.00
## Max. :1.0e+07 Max. :2.000 Max. :51.00 Max. :2.00
## balance numTrans numIntlTrans creditLine
## Min. : 0 Min. : 0.00 Min. : 0.000 Min. : 1.000
## 1st Qu.: 0 1st Qu.: 10.00 1st Qu.: 0.000 1st Qu.: 4.000
## Median : 3706 Median : 19.00 Median : 0.000 Median : 6.000
## Mean : 4110 Mean : 28.94 Mean : 4.047 Mean : 9.134
## 3rd Qu.: 6000 3rd Qu.: 39.00 3rd Qu.: 4.000 3rd Qu.:11.000
## Max. :41485 Max. :100.00 Max. :60.000 Max. :75.000
## fraudRisk
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.0596
## 3rd Qu.:0.0000
## Max. :1.0000
# Exploratory plots of the fraud data set.
# Columns are accessed through `fraud$...`, `with(fraud, ...)` or the `data =`
# argument instead of attach(fraud): attach() was never paired with detach(),
# so it left all nine column names on the search path for the rest of the
# session, and the attached copy would not follow later changes to `fraud`.
# with() and `data =` are used where the plot derives its title or axis
# labels from the expression, so those labels stay "balance", "gender" and
# "fraudRisk".

# Counts of each value of the fraud flag, gender code and state code.
barplot(table(fraud$fraudRisk))

barplot(table(fraud$gender))

barplot(table(fraud$state))

# Distribution of account balance: overall box plot, histogram, and box plots
# split by gender.
boxplot(fraud$balance)

with(fraud, hist(balance))

boxplot(balance ~ gender, data = fraud)

# Scatter plot of fraudRisk against gender.
with(fraud, plot(fraudRisk, gender))
