# Session housekeeping: show what is in the workspace, clear it, run the
# garbage collector, then confirm and set the working directory.
ls()
## character(0)
# NOTE(review): wipes every object returned by ls(); the recorded output above
# shows the workspace was already empty in this session.
rm(list = ls())
gc()
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 359198 19.2     592000 31.7   460000 24.6
## Vcells 549275  4.2    1023718  7.9   841234  6.5
getwd()
## [1] "/home/ajayohri/Desktop/test"
# NOTE(review): absolute, machine-specific path. The getwd() output above shows
# the session was already in this directory when the script was recorded.
setwd("/home/ajayohri/Desktop/test/")

# List the files in the working directory, then read the diamonds CSV with
# data.table's fread().
list.files()
## [1] "BigDiamonds.csv"     "BigDiamonds.csv.zip" "ccFraud.csv"        
## [4] "homework3"           "R.R"                 "R.spin.R"           
## [7] "R.spin.Rmd"
library(data.table)
diamonds <- fread("BigDiamonds.csv")
## 
## Read 71.9% of 598024 rows
## Read 598024 rows and 13 (of 13) columns from 0.049 GB file in 00:00:03
library(dplyr)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep only the five columns used below, drop the full table to free memory,
# and discard rows containing missing values.
diamonds2 <- dplyr::select(diamonds, price, carat, cut, color, clarity)
rm(diamonds)
diamonds2 <- na.omit(diamonds2)
gc()
##           used (Mb) gc trigger (Mb) max used  (Mb)
## Ncells  425534 22.8    1416599 75.7  1770749  94.6
## Vcells 3770225 28.8   11995632 91.6 13849290 105.7
# Price per carat.
diamonds2 <- dplyr::mutate(diamonds2, unitprice = price / carat)
class(diamonds2)
## [1] "data.frame"
# class() reports a plain data.frame at this point (see the recorded output),
# so convert back to a data.table for the grouped `[, j, by]` queries below.
diamonds2 <- data.table(diamonds2)
# Largest unit price (price / carat) within each cut, color and clarity group.
# The aggregate is unnamed, so data.table labels the result column V1.
diamonds2[, max(unitprice), by = "cut"]
##       cut       V1
## 1: V.Good 49519.40
## 2:   Good 43410.00
## 3:  Ideal 49481.59
diamonds2[, max(unitprice), by = "color"]
##    color       V1
## 1:     G 32998.33
## 2:     K 19516.77
## 3:     J 22890.12
## 4:     H 31718.95
## 5:     F 37084.06
## 6:     I 24982.45
## 7:     D 49519.40
## 8:     E 40871.01
## 9:     L 14585.41
diamonds2[, max(unitprice), by = "clarity"]
##    clarity       V1
## 1:     SI1 26606.31
## 2:     SI2 20859.43
## 3:     VS2 33196.01
## 4:      I1 11048.43
## 5:    VVS2 38581.30
## 6:     VS1 35737.04
## 7:    VVS1 42412.86
## 8:      I2 10500.00
## 9:      IF 49519.40
# Smallest unit price (price / carat) within each cut, color and clarity group.
# The aggregate is unnamed, so data.table labels the result column V1.
diamonds2[, min(unitprice), by = "cut"]
##       cut       V1
## 1: V.Good 604.7809
## 2:   Good 525.0000
## 3:  Ideal 671.1111
diamonds2[, min(unitprice), by = "color"]
##    color       V1
## 1:     G 654.2373
## 2:     K 637.6812
## 3:     J 630.6122
## 4:     H 660.8696
## 5:     F 671.1111
## 6:     I 525.0000
## 7:     D 603.8136
## 8:     E 604.7809
## 9:     L 683.3333
diamonds2[, min(unitprice), by = "clarity"]
##    clarity        V1
## 1:     SI1  735.4167
## 2:     SI2  726.4706
## 3:     VS2  866.6667
## 4:      I1  622.2222
## 5:    VVS2 1038.4615
## 6:     VS1  872.8814
## 7:    VVS1 1139.0244
## 8:      I2  525.0000
## 9:      IF 1093.7500
#2)
# List the files in the working directory again, then read the credit-card
# fraud CSV with fread().
list.files()
## [1] "BigDiamonds.csv"     "BigDiamonds.csv.zip" "ccFraud.csv"        
## [4] "homework3"           "R.R"                 "R.spin.R"           
## [7] "R.spin.Rmd"
fraud <- fread("ccFraud.csv")
## 
## Read 30.8% of 10000000 rows
## Read 57.0% of 10000000 rows
## Read 83.2% of 10000000 rows
## Read 10000000 rows and 9 (of 9) columns from 0.272 GB file in 00:00:05
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
# Show the available columns, then compute the mean balance per state via SQL.
colnames(fraud)
## [1] "custID"       "gender"       "state"        "cardholder"  
## [5] "balance"      "numTrans"     "numIntlTrans" "creditLine"  
## [9] "fraudRisk"
# The aggregate has no alias, so the result column is literally named
# `avg(balance)` and must be back-quoted when accessed with `$`.
b <- sqldf("select avg(balance),state from fraud group by state")
## Loading required package: tcltk
## Warning: Quoted identifiers should have class SQL, use DBI::SQL() if the
## caller performs the quoting.
print(b)
##    avg(balance) state
## 1      4154.930     1
## 2      4114.235     2
## 3      4142.211     3
## 4      4095.427     4
## 5      4116.938     5
## 6      4119.131     6
## 7      4115.194     7
## 8      4155.620     8
## 9      4100.105     9
## 10     4118.133    10
## 11     4119.510    11
## 12     4129.981    12
## 13     4113.976    13
## 14     4121.694    14
## 15     4105.597    15
## 16     4116.501    16
## 17     4091.558    17
## 18     4103.891    18
## 19     4099.715    19
## 20     4106.809    20
## 21     4107.426    21
## 22     4111.475    22
## 23     4108.556    23
## 24     4106.756    24
## 25     4101.034    25
## 26     4108.759    26
## 27     4128.886    27
## 28     4141.067    28
## 29     4093.955    29
## 30     4137.102    30
## 31     4058.545    31
## 32     4102.911    32
## 33     4097.649    33
## 34     4135.900    34
## 35     4103.357    35
## 36     4099.521    36
## 37     4106.725    37
## 38     4110.217    38
## 39     4106.832    39
## 40     4164.285    40
## 41     4119.070    41
## 42     4118.298    42
## 43     4109.054    43
## 44     4108.587    44
## 45     4105.399    45
## 46     4101.074    46
## 47     4087.891    47
## 48     4113.602    48
## 49     4103.754    49
## 50     4104.925    50
## 51     4147.824    51
# Highest per-state average balance, followed by the state that attains it.
# The state is computed with which.max() rather than read off the printed
# table by eye and recorded in a comment.
max(b$`avg(balance)`)
## [1] 4164.285
b$state[which.max(b$`avg(balance)`)] # State 40
## [1] 40
#3
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Load the Boston data set and keep only the two columns analysed below.
data(Boston)
Boston2 <- Boston[c("medv", "chas")]
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     combine, src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
# Summary statistics of medv within each level of chas.
# with() scopes the column lookup to this single call instead of attach()ing
# Boston2 to the search path for the rest of the session. Hmisc:: is spelled
# out because dplyr also exports summarize (see the masking messages above).
# NOTE(review): medv and chas are no longer left on the search path; confirm
# nothing later in the script relied on the attach().
with(Boston2, Hmisc::summarize(medv, chas, summary))
##   chas medv X1st.Qu. Median  Mean X3rd.Qu. Max.
## 1    0  5.0     16.6   20.9 22.09    24.80   50
## 2    1 13.4     21.1   23.3 28.44    33.15   50
#4
#https://cran.rstudio.com/web/packages/dplyr/vignettes/introduction.html