ls()
## character(0)
data("AirPassengers")
head(AirPassengers)
## [1] 112 118 132 129 121 135
ls()
## [1] "AirPassengers"
rm(AirPassengers)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 366893 19.6 592000 31.7 460000 24.6
## Vcells 561122 4.3 1308461 10.0 887926 6.8
data(iris)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
data("iris3")
head(iris3)
## [1] 5.1 4.9 4.7 4.6 5.0 5.4
rm(list = ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 371239 19.9 750400 40.1 460000 24.6
## Vcells 568183 4.4 1308461 10.0 887926 6.8
memory.limit()
## [1] 8096
memory.size()
## [1] 30.34
data(diamonds,package = "ggplot2")
object.size(diamonds)
## 3456376 bytes
al=c(1,2,3,65,76)
bl=c(1,10)
al*bl
## Warning in al * bl: longer object length is not a multiple of shorter
## object length
## [1] 1 20 3 650 76
al+bl
## Warning in al + bl: longer object length is not a multiple of shorter
## object length
## [1] 2 12 4 75 77
3*al
## [1] 3 6 9 195 228
length(al)
## [1] 5
class(al)
## [1] "numeric"
str(al)
## num [1:5] 1 2 3 65 76
ajay=rep(10,8)
ajay
## [1] 10 10 10 10 10 10 10 10
bjay=seq(1:5)
bjay
## [1] 1 2 3 4 5
newone=c("Babu","Sanjiv","Ajay")
class(newone)
## [1] "character"
substr(newone,1,3)
## [1] "Bab" "San" "Aja"
nchar(newone)
## [1] 4 6 4
substr(newone,nchar(newone)-2,nchar(newone))
## [1] "abu" "jiv" "jay"
library(stringr)
str_to_upper(newone)
## [1] "BABU" "SANJIV" "AJAY"
str_to_lower(newone)
## [1] "babu" "sanjiv" "ajay"
tolower(newone)
## [1] "babu" "sanjiv" "ajay"
toupper(newone)
## [1] "BABU" "SANJIV" "AJAY"
getwd()
## [1] "C:/Users/Dell/Documents/R/revision"
setwd("C:/Users/Dell/Documents")
dir()
## [1] "2017-01-18 07.48 Ajay_s Meeting.mp4"
## [2] "2017-01-26 19.48 Data Analytics Course Weekday Batch .mp4"
## [3] "2017-01-31 19.56 Data Analytics Course Weekday Batch .mp4"
## [4] "2017-02-02 20.04 Data Analytics Course Weekday Batch .mp4"
## [5] "2017-02-07 14.25 Data Analytics Course Weekday Batch .mp4"
## [6] "2017-02-09 20.01 Data Analytics Course Weekday Batch .mp4"
## [7] "2017-02-14 19.45 Data Analytics Course Weekday Batch .mp4"
## [8] "2017-02-16 20.16 Data Analytics Course Weekday Batch .mp4"
## [9] "2017-02-21 19.50 Data Analytics Course Weekday Batch .mp4"
## [10] "2017-02-23 19.56 Data Analytics Course Weekday Batch .mp4"
## [11] "2017-02-23 22.45 Data Analytics Course Weekday Batch .g2m"
## [12] "20170121-Business Analytics(3242874580)"
## [13] "20170128-Business Analytics(3242878477)"
## [14] "ccFraud.csv"
## [15] "chapter 3 _ spark.ipynb"
## [16] "ChatLog Ajay_s Meeting 2017_01_18 07_52.rtf"
## [17] "ChatLog Data Analytics Course Weekday Batch 2017_01_26 21_35.rtf"
## [18] "ChatLog Data Analytics Course Weekday Batch 2017_01_31 21_43.rtf"
## [19] "ChatLog Data Analytics Course Weekday Batch 2017_02_02 21_38.rtf"
## [20] "ChatLog Data Analytics Course Weekday Batch 2017_02_07 16_17.rtf"
## [21] "ChatLog Data Analytics Course Weekday Batch 2017_02_09 21_38.rtf"
## [22] "ChatLog Data Analytics Course Weekday Batch 2017_02_14 21_35.rtf"
## [23] "ChatLog Data Analytics Course Weekday Batch 2017_02_16 21_34.rtf"
## [24] "ChatLog Data Analytics Course Weekday Batch 2017_02_21 21_43.rtf"
## [25] "ChatLog Data Analytics Course Weekday Batch 2017_02_23 21_54.rtf"
## [26] "ChatLog New Meeting 2017_01_18 09_28.rtf"
## [27] "desktop.ini"
## [28] "Fax"
## [29] "GermanCredit.csv"
## [30] "GitHub"
## [31] "mtcars.csv"
## [32] "my first notebook.ipynb"
## [33] "My Music"
## [34] "My Pictures"
## [35] "my second notebook.ipynb"
## [36] "My Tableau Repository"
## [37] "My Videos"
## [38] "notebook-Copy1.ipynb"
## [39] "notebook.ipynb"
## [40] "original"
## [41] "Python Scripts"
## [42] "R"
## [43] "rsconnect"
## [44] "SAS-University-Edition"
## [45] "Scanned Documents"
## [46] "sql.html"
## [47] "sql.R"
## [48] "sqlalchemy.ipynb"
## [49] "stackoverflow-dump-analysis.ipynb"
## [50] "Thumbs.db"
## [51] "time series.ipynb"
## [52] "Untitled.ipynb"
## [53] "untitled.txt"
## [54] "Untitled1.ipynb"
## [55] "Untitled2.ipynb"
## [56] "Untitled3.ipynb"
## [57] "Untitled4.ipynb"
## [58] "Untitled5.ipynb"
## [59] "unvbasicvapp__9411006__ova__en__sp0__1.ova"
dir(pattern = "csv")
## [1] "ccFraud.csv" "GermanCredit.csv" "mtcars.csv"
library("MASS")
dir(pattern = "csv")
## [1] "ccFraud.csv" "GermanCredit.csv" "mtcars.csv"
library(data.table)
library(readr)
system.time(read.csv("ccFraud.csv"))
## user system elapsed
## 78.62 1.71 81.15
system.time(read_csv("ccFraud.csv"))
## Parsed with column specification:
## cols(
## custID = col_integer(),
## gender = col_integer(),
## state = col_integer(),
## cardholder = col_integer(),
## balance = col_integer(),
## numTrans = col_integer(),
## numIntlTrans = col_integer(),
## creditLine = col_integer(),
## fraudRisk = col_integer()
## )
## user system elapsed
## 8.87 0.17 9.10
system.time(fread("ccFraud.csv"))
##
Read 22.4% of 10000000 rows
Read 38.2% of 10000000 rows
Read 54.0% of 10000000 rows
Read 69.6% of 10000000 rows
Read 84.8% of 10000000 rows
Read 10000000 rows and 9 (of 9) columns from 0.272 GB file in 00:00:08
## user system elapsed
## 6.91 0.06 7.05
library(microbenchmark)
microbenchmark(
fread("ccFraud.csv"),
read_csv("ccFraud.csv"),
read.csv("ccFraud.csv"),times=2)
##
Read 15.2% of 10000000 rows
Read 30.1% of 10000000 rows
Read 45.2% of 10000000 rows
Read 58.0% of 10000000 rows
Read 69.0% of 10000000 rows
Read 79.6% of 10000000 rows
Read 90.1% of 10000000 rows
Read 10000000 rows and 9 (of 9) columns from 0.272 GB file in 00:00:09
## Parsed with column specification:
## cols(
## custID = col_integer(),
## gender = col_integer(),
## state = col_integer(),
## cardholder = col_integer(),
## balance = col_integer(),
## numTrans = col_integer(),
## numIntlTrans = col_integer(),
## creditLine = col_integer(),
## fraudRisk = col_integer()
## )
## Parsed with column specification:
## cols(
## custID = col_integer(),
## gender = col_integer(),
## state = col_integer(),
## cardholder = col_integer(),
## balance = col_integer(),
## numTrans = col_integer(),
## numIntlTrans = col_integer(),
## creditLine = col_integer(),
## fraudRisk = col_integer()
## )
##
Read 20.3% of 10000000 rows
Read 35.1% of 10000000 rows
Read 50.4% of 10000000 rows
Read 65.6% of 10000000 rows
Read 80.8% of 10000000 rows
Read 92.3% of 10000000 rows
Read 10000000 rows and 9 (of 9) columns from 0.272 GB file in 00:00:08
## Unit: seconds
## expr min lq mean median uq
## fread("ccFraud.csv") 7.867825 7.867825 8.415806 8.415806 8.963787
## read_csv("ccFraud.csv") 9.363628 9.363628 10.176424 10.176424 10.989221
## read.csv("ccFraud.csv") 76.465238 76.465238 78.071802 78.071802 79.678367
## max neval cld
## 8.963787 2 a
## 10.989221 2 a
## 79.678367 2 b
library(RODBC)
odbcDataSources()
## dBASE Files
## "Microsoft Access dBASE Driver (*.dbf, *.ndx, *.mdx)"
## Excel Files
## "Microsoft Excel Driver (*.xls, *.xlsx, *.xlsm, *.xlsb)"
## MS Access Database
## "Microsoft Access Driver (*.mdb, *.accdb)"
## PostgreSQL30
## "PostgreSQL ODBC Driver(ANSI)"
chan=odbcConnect("PostgreSQL30","postgres;Password=root;Database=ajay")
sqlTables(chan)
## TABLE_CAT TABLE_SCHEM TABLE_NAME TABLE_TYPE REMARKS
## 1 ajay public sales TABLE
sqlFetch(chan,"sales",max=10)
## customer_id sales date product_id
## 1 10001 5230 2017-02-07 524
## 2 10002 2781 2017-05-12 469
## 3 10003 2083 2016-12-18 917
## 4 10004 214 2015-01-19 354
## 5 10005 9407 2016-09-26 292
## 6 10006 4705 2015-10-17 380
## 7 10007 4729 2016-01-02 469
## 8 10008 7715 2015-09-12 480
## 9 10009 9898 2015-04-05 611
## 10 10010 5797 2015-08-13 959
newsales=sqlFetch(chan,"sales")
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
prodfreq=sqldf("select count(product_id) as ct,product_id from newsales group by product_id")
## Loading required package: tcltk
## Warning: Quoted identifiers should have class SQL, use DBI::SQL() if the
## caller performs the quoting.
names(prodfreq)
## [1] "ct" "product_id"
prodfreq2=prodfreq[order(-prodfreq$ct),]
head(prodfreq2)
## ct product_id
## 67 4 266
## 118 4 394
## 224 4 667
## 284 4 807
## 19 3 147
## 36 3 195
testdata=c(NA,NA,46,78,98)
mean(testdata)
## [1] NA
table(is.na(testdata))
##
## FALSE TRUE
## 3 2
mean(testdata,na.rm = T)
## [1] 74
testdata
## [1] NA NA 46 78 98
testdata2=na.omit(testdata)
testdata2
## [1] 46 78 98
## attr(,"na.action")
## [1] 1 2
## attr(,"class")
## [1] "omit"
for (i in 1:10){print(i)}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
simplefun=function(x){x^4}
simplefun(10)
## [1] 10000
for (i in 1:10){print(simplefun(i))}
## [1] 1
## [1] 16
## [1] 81
## [1] 256
## [1] 625
## [1] 1296
## [1] 2401
## [1] 4096
## [1] 6561
## [1] 10000
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
lapply(iris,summary)
## $Sepal.Length
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.300 5.100 5.800 5.843 6.400 7.900
##
## $Sepal.Width
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 2.800 3.000 3.057 3.300 4.400
##
## $Petal.Length
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.600 4.350 3.758 5.100 6.900
##
## $Petal.Width
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.100 0.300 1.300 1.199 1.800 2.500
##
## $Species
## setosa versicolor virginica
## 50 50 50
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
describe(iris)
## iris
##
## 5 Variables 150 Observations
## ---------------------------------------------------------------------------
## Sepal.Length
## n missing distinct Info Mean Gmd .05 .10
## 150 0 35 0.998 5.843 0.9462 4.600 4.800
## .25 .50 .75 .90 .95
## 5.100 5.800 6.400 6.900 7.255
##
## lowest : 4.3 4.4 4.5 4.6 4.7, highest: 7.3 7.4 7.6 7.7 7.9
## ---------------------------------------------------------------------------
## Sepal.Width
## n missing distinct Info Mean Gmd .05 .10
## 150 0 23 0.992 3.057 0.4872 2.345 2.500
## .25 .50 .75 .90 .95
## 2.800 3.000 3.300 3.610 3.800
##
## lowest : 2.0 2.2 2.3 2.4 2.5, highest: 3.9 4.0 4.1 4.2 4.4
## ---------------------------------------------------------------------------
## Petal.Length
## n missing distinct Info Mean Gmd .05 .10
## 150 0 43 0.998 3.758 1.979 1.30 1.40
## .25 .50 .75 .90 .95
## 1.60 4.35 5.10 5.80 6.10
##
## lowest : 1.0 1.1 1.2 1.3 1.4, highest: 6.3 6.4 6.6 6.7 6.9
## ---------------------------------------------------------------------------
## Petal.Width
## n missing distinct Info Mean Gmd .05 .10
## 150 0 22 0.99 1.199 0.8676 0.2 0.2
## .25 .50 .75 .90 .95
## 0.3 1.3 1.8 2.2 2.3
##
## lowest : 0.1 0.2 0.3 0.4 0.5, highest: 2.1 2.2 2.3 2.4 2.5
## ---------------------------------------------------------------------------
## Species
## n missing distinct
## 150 0 3
##
## Value setosa versicolor virginica
## Frequency 50 50 50
## Proportion 0.333 0.333 0.333
## ---------------------------------------------------------------------------
lapply(iris,describe)
## $Sepal.Length
## X[[i]]
## n missing distinct Info Mean Gmd .05 .10
## 150 0 35 0.998 5.843 0.9462 4.600 4.800
## .25 .50 .75 .90 .95
## 5.100 5.800 6.400 6.900 7.255
##
## lowest : 4.3 4.4 4.5 4.6 4.7, highest: 7.3 7.4 7.6 7.7 7.9
##
## $Sepal.Width
## X[[i]]
## n missing distinct Info Mean Gmd .05 .10
## 150 0 23 0.992 3.057 0.4872 2.345 2.500
## .25 .50 .75 .90 .95
## 2.800 3.000 3.300 3.610 3.800
##
## lowest : 2.0 2.2 2.3 2.4 2.5, highest: 3.9 4.0 4.1 4.2 4.4
##
## $Petal.Length
## X[[i]]
## n missing distinct Info Mean Gmd .05 .10
## 150 0 43 0.998 3.758 1.979 1.30 1.40
## .25 .50 .75 .90 .95
## 1.60 4.35 5.10 5.80 6.10
##
## lowest : 1.0 1.1 1.2 1.3 1.4, highest: 6.3 6.4 6.6 6.7 6.9
##
## $Petal.Width
## X[[i]]
## n missing distinct Info Mean Gmd .05 .10
## 150 0 22 0.99 1.199 0.8676 0.2 0.2
## .25 .50 .75 .90 .95
## 0.3 1.3 1.8 2.2 2.3
##
## lowest : 0.1 0.2 0.3 0.4 0.5, highest: 2.1 2.2 2.3 2.4 2.5
##
## $Species
## X[[i]]
## n missing distinct
## 150 0 3
##
## Value setosa versicolor virginica
## Frequency 50 50 50
## Proportion 0.333 0.333 0.333
summarize(iris$Sepal.Length,iris$Species,summary)
## iris$Species iris.Sepal.Length X1st.Qu. Median Mean X3rd.Qu. Max.
## 1 setosa 4.3 4.800 5.0 5.006 5.2 5.8
## 2 versicolor 4.9 5.600 5.9 5.936 6.3 7.0
## 3 virginica 4.9 6.225 6.5 6.588 6.9 7.9