# Session housekeeping, then load the diamonds data set with data.table.
# Lines starting with "## " are console output recorded from the original run.
ls()
## character(0)
# NOTE(review): rm(list = ls()) wipes the caller's global environment, and the
# hard-coded setwd() below only works on the original author's machine. Both
# are kept because the relative file names used later depend on the working
# directory set here.
rm(list = ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 359198 19.2 592000 31.7 460000 24.6
## Vcells 549275 4.2 1023718 7.9 841234 6.5
getwd()
## [1] "/home/ajayohri/Desktop/test"
setwd("/home/ajayohri/Desktop/test/")
dir()
## [1] "BigDiamonds.csv" "BigDiamonds.csv.zip" "ccFraud.csv"
## [4] "homework3" "R.R" "R.spin.R"
## [7] "R.spin.Rmd"
library(data.table)
# fread() prints its progress to the console; that output is kept below as
# comments so the file remains parseable R.
diamonds <- fread("BigDiamonds.csv")
##
## Read 71.9% of 598024 rows
## Read 598024 rows and 13 (of 13) columns from 0.049 GB file in 00:00:03
library(dplyr)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Build the working table: keep five columns, drop incomplete rows, and add a
# price-per-carat column.
diamonds2 <- diamonds %>%
  select(price, carat, cut, color, clarity)
# The full table is no longer needed once the columns have been extracted.
rm(diamonds)
diamonds2 <- na.omit(diamonds2)
# Report memory use after releasing the original table.
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 425534 22.8 1416599 75.7 1770749 94.6
## Vcells 3770225 28.8 11995632 91.6 13849290 105.7
diamonds2 <- diamonds2 %>%
  mutate(unitprice = price / carat)
class(diamonds2)
## [1] "data.frame"
# The object is a plain data.frame at this point; convert it to a data.table
# so the grouped queries that follow can use DT[i, j, by] syntax.
diamonds2 <- data.table(diamonds2)
# Highest unit price within each level of cut, color and clarity.
diamonds2[, max(unitprice), by = "cut"]
## cut V1
## 1: V.Good 49519.40
## 2: Good 43410.00
## 3: Ideal 49481.59
diamonds2[, max(unitprice), by = "color"]
## color V1
## 1: G 32998.33
## 2: K 19516.77
## 3: J 22890.12
## 4: H 31718.95
## 5: F 37084.06
## 6: I 24982.45
## 7: D 49519.40
## 8: E 40871.01
## 9: L 14585.41
diamonds2[, max(unitprice), by = "clarity"]
## clarity V1
## 1: SI1 26606.31
## 2: SI2 20859.43
## 3: VS2 33196.01
## 4: I1 11048.43
## 5: VVS2 38581.30
## 6: VS1 35737.04
## 7: VVS1 42412.86
## 8: I2 10500.00
## 9: IF 49519.40
# Lowest unit price within each level of the same three columns.
diamonds2[, min(unitprice), by = "cut"]
## cut V1
## 1: V.Good 604.7809
## 2: Good 525.0000
## 3: Ideal 671.1111
diamonds2[, min(unitprice), by = "color"]
## color V1
## 1: G 654.2373
## 2: K 637.6812
## 3: J 630.6122
## 4: H 660.8696
## 5: F 671.1111
## 6: I 525.0000
## 7: D 603.8136
## 8: E 604.7809
## 9: L 683.3333
diamonds2[, min(unitprice), by = "clarity"]
## clarity V1
## 1: SI1 735.4167
## 2: SI2 726.4706
## 3: VS2 866.6667
## 4: I1 622.2222
## 5: VVS2 1038.4615
## 6: VS1 872.8814
## 7: VVS1 1139.0244
## 8: I2 525.0000
## 9: IF 1093.7500
# List the working directory, then load the credit-card fraud data set.
dir()
## [1] "BigDiamonds.csv" "BigDiamonds.csv.zip" "ccFraud.csv"
## [4] "homework3" "R.R" "R.spin.R"
## [7] "R.spin.Rmd"
# fread() prints its progress to the console; that output is kept below as
# comments so the file remains parseable R.
fraud <- fread("ccFraud.csv")
##
## Read 30.8% of 10000000 rows
## Read 57.0% of 10000000 rows
## Read 83.2% of 10000000 rows
## Read 10000000 rows and 9 (of 9) columns from 0.272 GB file in 00:00:05
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
# Inspect the available columns before querying.
names(fraud)
## [1] "custID" "gender" "state" "cardholder"
## [5] "balance" "numTrans" "numIntlTrans" "creditLine"
## [9] "fraudRisk"
# Mean balance per state, computed with SQL against the in-memory table. The
# aggregate is not aliased, so the result column is named "avg(balance)".
b <- sqldf(x = "select avg(balance),state from fraud group by state")
## Loading required package: tcltk
## Warning: Quoted identifiers should have class SQL, use DBI::SQL() if the
## caller performs the quoting.
print(b)
## avg(balance) state
## 1 4154.930 1
## 2 4114.235 2
## 3 4142.211 3
## 4 4095.427 4
## 5 4116.938 5
## 6 4119.131 6
## 7 4115.194 7
## 8 4155.620 8
## 9 4100.105 9
## 10 4118.133 10
## 11 4119.510 11
## 12 4129.981 12
## 13 4113.976 13
## 14 4121.694 14
## 15 4105.597 15
## 16 4116.501 16
## 17 4091.558 17
## 18 4103.891 18
## 19 4099.715 19
## 20 4106.809 20
## 21 4107.426 21
## 22 4111.475 22
## 23 4108.556 23
## 24 4106.756 24
## 25 4101.034 25
## 26 4108.759 26
## 27 4128.886 27
## 28 4141.067 28
## 29 4093.955 29
## 30 4137.102 30
## 31 4058.545 31
## 32 4102.911 32
## 33 4097.649 33
## 34 4135.900 34
## 35 4103.357 35
## 36 4099.521 36
## 37 4106.725 37
## 38 4110.217 38
## 39 4106.832 39
## 40 4164.285 40
## 41 4119.070 41
## 42 4118.298 42
## 43 4109.054 43
## 44 4108.587 44
## 45 4105.399 45
## 46 4101.074 46
## 47 4087.891 47
## 48 4113.602 48
## 49 4103.754 49
## 50 4104.925 50
## 51 4147.824 51
# Largest per-state average balance; the column name contains parentheses, so
# it is looked up by its exact string name.
max(b[["avg(balance)"]])
## [1] 4164.285
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Load the Boston data set and keep only the medv and chas columns.
data("Boston")
Boston2 <- Boston[c("medv", "chas")]
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## combine, src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
# Summary statistics of medv within each level of chas.
# with() evaluates the call inside Boston2 without attach()-ing it, so nothing
# is left on the search path afterwards. The bare column names are still what
# summarize() sees, so the output labels stay "medv" and "chas".
# Hmisc:: is spelled out because dplyr also exports a summarize().
with(Boston2, Hmisc::summarize(medv, chas, summary))
## chas medv X1st.Qu. Median Mean X3rd.Qu. Max.
## 1 0 5.0 16.6 20.9 22.09 24.80 50
## 2 1 13.4 21.1 23.3 28.44 33.15 50