revision.R

#This is a revision
ls()

## character(0)

data("AirPassengers")
head(AirPassengers)

## [1] 112 118 132 129 121 135

ls()

## [1] "AirPassengers"

rm(AirPassengers)
gc()

##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 366893 19.6     592000 31.7   460000 24.6
## Vcells 561122  4.3    1308461 10.0   887926  6.8

data(iris)
head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

data("iris3")
head(iris3)

## [1] 5.1 4.9 4.7 4.6 5.0 5.4

rm(list = ls())
gc()

##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 371239 19.9     750400 40.1   460000 24.6
## Vcells 568183  4.4    1308461 10.0   887926  6.8

memory.limit()

## [1] 8096

memory.size()

## [1] 30.34

data(diamonds,package = "ggplot2")
object.size(diamonds)

## 3456376 bytes

#Vectors

al=c(1,2,3,65,76)
bl=c(1,10)
al*bl

## Warning in al * bl: longer object length is not a multiple of shorter
## object length

## [1]   1  20   3 650  76

al+bl

## Warning in al + bl: longer object length is not a multiple of shorter
## object length

## [1]  2 12  4 75 77

3*al

## [1]   3   6   9 195 228

length(al)

## [1] 5

class(al)

## [1] "numeric"

str(al)

##  num [1:5] 1 2 3 65 76

ajay=rep(10,8)
ajay

## [1] 10 10 10 10 10 10 10 10

bjay=seq(1:5)
bjay

## [1] 1 2 3 4 5

newone=c("Babu","Sanjiv","Ajay")
class(newone)

## [1] "character"

substr(newone,1,3)

## [1] "Bab" "San" "Aja"

nchar(newone)

## [1] 4 6 4

substr(newone,nchar(newone)-2,nchar(newone))

## [1] "abu" "jiv" "jay"

library(stringr)
str_to_upper(newone)

## [1] "BABU"   "SANJIV" "AJAY"

str_to_lower(newone)

## [1] "babu"   "sanjiv" "ajay"

tolower(newone)

## [1] "babu"   "sanjiv" "ajay"

toupper(newone)

## [1] "BABU"   "SANJIV" "AJAY"

#File Operations
getwd()

## [1] "C:/Users/Dell/Documents/R/revision"

setwd("C:/Users/Dell/Documents")  #one forward slash=/ for paths 
dir()

##  [1] "2017-01-18 07.48 Ajay_s Meeting.mp4"                              
##  [2] "2017-01-26 19.48 Data Analytics Course Weekday Batch .mp4"        
##  [3] "2017-01-31 19.56 Data Analytics Course Weekday Batch .mp4"        
##  [4] "2017-02-02 20.04 Data Analytics Course Weekday Batch .mp4"        
##  [5] "2017-02-07 14.25 Data Analytics Course Weekday Batch .mp4"        
##  [6] "2017-02-09 20.01 Data Analytics Course Weekday Batch .mp4"        
##  [7] "2017-02-14 19.45 Data Analytics Course Weekday Batch .mp4"        
##  [8] "2017-02-16 20.16 Data Analytics Course Weekday Batch .mp4"        
##  [9] "2017-02-21 19.50 Data Analytics Course Weekday Batch .mp4"        
## [10] "2017-02-23 19.56 Data Analytics Course Weekday Batch .mp4"        
## [11] "2017-02-23 22.45 Data Analytics Course Weekday Batch .g2m"        
## [12] "20170121-Business Analytics(3242874580)"                          
## [13] "20170128-Business Analytics(3242878477)"                          
## [14] "ccFraud.csv"                                                      
## [15] "chapter 3 _ spark.ipynb"                                          
## [16] "ChatLog Ajay_s Meeting 2017_01_18 07_52.rtf"                      
## [17] "ChatLog Data Analytics Course Weekday Batch  2017_01_26 21_35.rtf"
## [18] "ChatLog Data Analytics Course Weekday Batch  2017_01_31 21_43.rtf"
## [19] "ChatLog Data Analytics Course Weekday Batch  2017_02_02 21_38.rtf"
## [20] "ChatLog Data Analytics Course Weekday Batch  2017_02_07 16_17.rtf"
## [21] "ChatLog Data Analytics Course Weekday Batch  2017_02_09 21_38.rtf"
## [22] "ChatLog Data Analytics Course Weekday Batch  2017_02_14 21_35.rtf"
## [23] "ChatLog Data Analytics Course Weekday Batch  2017_02_16 21_34.rtf"
## [24] "ChatLog Data Analytics Course Weekday Batch  2017_02_21 21_43.rtf"
## [25] "ChatLog Data Analytics Course Weekday Batch  2017_02_23 21_54.rtf"
## [26] "ChatLog New Meeting 2017_01_18 09_28.rtf"                         
## [27] "desktop.ini"                                                      
## [28] "Fax"                                                              
## [29] "GermanCredit.csv"                                                 
## [30] "GitHub"                                                           
## [31] "mtcars.csv"                                                       
## [32] "my first notebook.ipynb"                                          
## [33] "My Music"                                                         
## [34] "My Pictures"                                                      
## [35] "my second notebook.ipynb"                                         
## [36] "My Tableau Repository"                                            
## [37] "My Videos"                                                        
## [38] "notebook-Copy1.ipynb"                                             
## [39] "notebook.ipynb"                                                   
## [40] "original"                                                         
## [41] "Python Scripts"                                                   
## [42] "R"                                                                
## [43] "rsconnect"                                                        
## [44] "SAS-University-Edition"                                           
## [45] "Scanned Documents"                                                
## [46] "sql.html"                                                         
## [47] "sql.R"                                                            
## [48] "sqlalchemy.ipynb"                                                 
## [49] "stackoverflow-dump-analysis.ipynb"                                
## [50] "Thumbs.db"                                                        
## [51] "time series.ipynb"                                                
## [52] "Untitled.ipynb"                                                   
## [53] "untitled.txt"                                                     
## [54] "Untitled1.ipynb"                                                  
## [55] "Untitled2.ipynb"                                                  
## [56] "Untitled3.ipynb"                                                  
## [57] "Untitled4.ipynb"                                                  
## [58] "Untitled5.ipynb"                                                  
## [59] "unvbasicvapp__9411006__ova__en__sp0__1.ova"

dir(pattern = "csv")

## [1] "ccFraud.csv"      "GermanCredit.csv" "mtcars.csv"

#packages

# install.packages("abc")
# 
# Installing package into ‘C:/Users/Dell/Documents/R/win-library/3.3’
# (as ‘lib’ is unspecified)
# trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.3/abc_2.1.zip'
# Content type 'application/zip' length 520806 bytes (508 KB)
# downloaded 508 KB
# 
# package ‘abc’ successfully unpacked and MD5 sums checked
# 
# The downloaded binary packages are in
# C:\Users\Dell\AppData\Local\Temp\RtmpMpknEz\downloaded_packages

##update.packages()
# > 
library("MASS")

dir(pattern = "csv")

## [1] "ccFraud.csv"      "GermanCredit.csv" "mtcars.csv"

#data input

library(data.table)
library(readr)
#install.packages("microbenchmark")
system.time(read.csv("ccFraud.csv")) #base R

##    user  system elapsed 
##   78.62    1.71   81.15

system.time(read_csv("ccFraud.csv")) #read R

## Parsed with column specification:
## cols(
##   custID = col_integer(),
##   gender = col_integer(),
##   state = col_integer(),
##   cardholder = col_integer(),
##   balance = col_integer(),
##   numTrans = col_integer(),
##   numIntlTrans = col_integer(),
##   creditLine = col_integer(),
##   fraudRisk = col_integer()
## )

##    user  system elapsed 
##    8.87    0.17    9.10

system.time(fread("ccFraud.csv")) #data.table

## 
Read 22.4% of 10000000 rows
Read 38.2% of 10000000 rows
Read 54.0% of 10000000 rows
Read 69.6% of 10000000 rows
Read 84.8% of 10000000 rows
Read 10000000 rows and 9 (of 9) columns from 0.272 GB file in 00:00:08

##    user  system elapsed 
##    6.91    0.06    7.05

#http://adv-r.had.co.nz/Profiling.html
library(microbenchmark)
microbenchmark(
  fread("ccFraud.csv"),
  read_csv("ccFraud.csv"),
  read.csv("ccFraud.csv"),times=2)

## 
Read 15.2% of 10000000 rows
Read 30.1% of 10000000 rows
Read 45.2% of 10000000 rows
Read 58.0% of 10000000 rows
Read 69.0% of 10000000 rows
Read 79.6% of 10000000 rows
Read 90.1% of 10000000 rows
Read 10000000 rows and 9 (of 9) columns from 0.272 GB file in 00:00:09

## Parsed with column specification:
## cols(
##   custID = col_integer(),
##   gender = col_integer(),
##   state = col_integer(),
##   cardholder = col_integer(),
##   balance = col_integer(),
##   numTrans = col_integer(),
##   numIntlTrans = col_integer(),
##   creditLine = col_integer(),
##   fraudRisk = col_integer()
## )
## Parsed with column specification:
## cols(
##   custID = col_integer(),
##   gender = col_integer(),
##   state = col_integer(),
##   cardholder = col_integer(),
##   balance = col_integer(),
##   numTrans = col_integer(),
##   numIntlTrans = col_integer(),
##   creditLine = col_integer(),
##   fraudRisk = col_integer()
## )

## 
Read 20.3% of 10000000 rows
Read 35.1% of 10000000 rows
Read 50.4% of 10000000 rows
Read 65.6% of 10000000 rows
Read 80.8% of 10000000 rows
Read 92.3% of 10000000 rows
Read 10000000 rows and 9 (of 9) columns from 0.272 GB file in 00:00:08

## Unit: seconds
##                     expr       min        lq      mean    median        uq
##     fread("ccFraud.csv")  7.867825  7.867825  8.415806  8.415806  8.963787
##  read_csv("ccFraud.csv")  9.363628  9.363628 10.176424 10.176424 10.989221
##  read.csv("ccFraud.csv") 76.465238 76.465238 78.071802 78.071802 79.678367
##        max neval cld
##   8.963787     2  a 
##  10.989221     2  a 
##  79.678367     2   b

# 
# #profiling
# #install.packages("lineprof")
# devtools::install_github("hadley/lineprof")
# 
# library(lineprof)
# f <- function() {
#   pause(0.1)
#   g()
#   h()
# }
# g <- function() {
#   pause(0.1)
#   h()
# }
# h <- function() {
#   pause(0.1)
# }
# 
# l <- lineprof(f())
# l
# 
# library(shiny)
# shine(l)

#RODBC

library(RODBC) #loading the package RODBC 
odbcDataSources() # to check all available ODBC data sources

##                                              dBASE Files 
##    "Microsoft Access dBASE Driver (*.dbf, *.ndx, *.mdx)" 
##                                              Excel Files 
## "Microsoft Excel Driver (*.xls, *.xlsx, *.xlsm, *.xlsb)" 
##                                       MS Access Database 
##               "Microsoft Access Driver (*.mdb, *.accdb)" 
##                                             PostgreSQL30 
##                           "PostgreSQL ODBC Driver(ANSI)"

#creating a Database connection
# for username,password,database name and DSN name

chan=odbcConnect("PostgreSQL30","postgres;Password=root;Database=ajay")
#to list all table names
sqlTables(chan)

##   TABLE_CAT TABLE_SCHEM TABLE_NAME TABLE_TYPE REMARKS
## 1      ajay      public      sales      TABLE

#and fetch some data
sqlFetch(chan,"sales",max=10)

##    customer_id sales       date product_id
## 1        10001  5230 2017-02-07        524
## 2        10002  2781 2017-05-12        469
## 3        10003  2083 2016-12-18        917
## 4        10004   214 2015-01-19        354
## 5        10005  9407 2016-09-26        292
## 6        10006  4705 2015-10-17        380
## 7        10007  4729 2016-01-02        469
## 8        10008  7715 2015-09-12        480
## 9        10009  9898 2015-04-05        611
## 10       10010  5797 2015-08-13        959

newsales=sqlFetch(chan,"sales")
library(sqldf)

## Loading required package: gsubfn

## Loading required package: proto

## Loading required package: RSQLite

prodfreq=sqldf("select count(product_id) as ct,product_id from newsales group by product_id")

## Loading required package: tcltk

## Warning: Quoted identifiers should have class SQL, use DBI::SQL() if the
## caller performs the quoting.

names(prodfreq)

## [1] "ct"         "product_id"

prodfreq2=prodfreq[order(-prodfreq$ct),] 
head(prodfreq2)

##     ct product_id
## 67   4        266
## 118  4        394
## 224  4        667
## 284  4        807
## 19   3        147
## 36   3        195

#missing values
testdata=c(NA,NA,46,78,98)
mean(testdata)

## [1] NA

table(is.na(testdata))

## 
## FALSE  TRUE 
##     3     2

mean(testdata,na.rm = T)

## [1] 74

testdata

## [1] NA NA 46 78 98

testdata2=na.omit(testdata)
testdata2

## [1] 46 78 98
## attr(,"na.action")
## [1] 1 2
## attr(,"class")
## [1] "omit"

#for loop

for (i in 1:10){print(i)}

## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10

simplefun=function(x){x^4}
simplefun(10)

## [1] 10000

for (i in 1:10){print(simplefun(i))}

## [1] 1
## [1] 16
## [1] 81
## [1] 256
## [1] 625
## [1] 1296
## [1] 2401
## [1] 4096
## [1] 6561
## [1] 10000

#lapply
summary(iris)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

lapply(iris,summary)

## $Sepal.Length
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.300   5.100   5.800   5.843   6.400   7.900 
## 
## $Sepal.Width
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   2.800   3.000   3.057   3.300   4.400 
## 
## $Petal.Length
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.600   4.350   3.758   5.100   6.900 
## 
## $Petal.Width
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.100   0.300   1.300   1.199   1.800   2.500 
## 
## $Species
##     setosa versicolor  virginica 
##         50         50         50

library(Hmisc)

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## Loading required package: ggplot2

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units

describe(iris)

## iris 
## 
##  5  Variables      150  Observations
## ---------------------------------------------------------------------------
## Sepal.Length 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      150        0       35    0.998    5.843   0.9462    4.600    4.800 
##      .25      .50      .75      .90      .95 
##    5.100    5.800    6.400    6.900    7.255 
## 
## lowest : 4.3 4.4 4.5 4.6 4.7, highest: 7.3 7.4 7.6 7.7 7.9
## ---------------------------------------------------------------------------
## Sepal.Width 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      150        0       23    0.992    3.057   0.4872    2.345    2.500 
##      .25      .50      .75      .90      .95 
##    2.800    3.000    3.300    3.610    3.800 
## 
## lowest : 2.0 2.2 2.3 2.4 2.5, highest: 3.9 4.0 4.1 4.2 4.4
## ---------------------------------------------------------------------------
## Petal.Length 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      150        0       43    0.998    3.758    1.979     1.30     1.40 
##      .25      .50      .75      .90      .95 
##     1.60     4.35     5.10     5.80     6.10 
## 
## lowest : 1.0 1.1 1.2 1.3 1.4, highest: 6.3 6.4 6.6 6.7 6.9
## ---------------------------------------------------------------------------
## Petal.Width 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      150        0       22     0.99    1.199   0.8676      0.2      0.2 
##      .25      .50      .75      .90      .95 
##      0.3      1.3      1.8      2.2      2.3 
## 
## lowest : 0.1 0.2 0.3 0.4 0.5, highest: 2.1 2.2 2.3 2.4 2.5
## ---------------------------------------------------------------------------
## Species 
##        n  missing distinct 
##      150        0        3 
##                                            
## Value          setosa versicolor  virginica
## Frequency          50         50         50
## Proportion      0.333      0.333      0.333
## ---------------------------------------------------------------------------

lapply(iris,describe)

## $Sepal.Length
## X[[i]] 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      150        0       35    0.998    5.843   0.9462    4.600    4.800 
##      .25      .50      .75      .90      .95 
##    5.100    5.800    6.400    6.900    7.255 
## 
## lowest : 4.3 4.4 4.5 4.6 4.7, highest: 7.3 7.4 7.6 7.7 7.9
## 
## $Sepal.Width
## X[[i]] 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      150        0       23    0.992    3.057   0.4872    2.345    2.500 
##      .25      .50      .75      .90      .95 
##    2.800    3.000    3.300    3.610    3.800 
## 
## lowest : 2.0 2.2 2.3 2.4 2.5, highest: 3.9 4.0 4.1 4.2 4.4
## 
## $Petal.Length
## X[[i]] 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      150        0       43    0.998    3.758    1.979     1.30     1.40 
##      .25      .50      .75      .90      .95 
##     1.60     4.35     5.10     5.80     6.10 
## 
## lowest : 1.0 1.1 1.2 1.3 1.4, highest: 6.3 6.4 6.6 6.7 6.9
## 
## $Petal.Width
## X[[i]] 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      150        0       22     0.99    1.199   0.8676      0.2      0.2 
##      .25      .50      .75      .90      .95 
##      0.3      1.3      1.8      2.2      2.3 
## 
## lowest : 0.1 0.2 0.3 0.4 0.5, highest: 2.1 2.2 2.3 2.4 2.5
## 
## $Species
## X[[i]] 
##        n  missing distinct 
##      150        0        3 
##                                            
## Value          setosa versicolor  virginica
## Frequency          50         50         50
## Proportion      0.333      0.333      0.333

summarize(iris$Sepal.Length,iris$Species,summary)

##   iris$Species iris.Sepal.Length X1st.Qu. Median  Mean X3rd.Qu. Max.
## 1       setosa               4.3    4.800    5.0 5.006      5.2  5.8
## 2   versicolor               4.9    5.600    5.9 5.936      6.3  7.0
## 3    virginica               4.9    6.225    6.5 6.588      6.9  7.9

revision.R

Dell

Sat Feb 25 09:46:06 2017