# dataq.R

# Salary figures as they typically arrive: text with stray "$" signs and
# thousands separators, one bare number (coerced to text by c()), one missing.
money <- c("$50000", "$50,000", "50,000", 50000, "50000", NA)
# First attempt at the average: it fails because the vector is character.
mean(money, na.rm = TRUE)

## Warning in mean.default(money, na.rm = TRUE): argument is not numeric or
## logical: returning NA

## [1] NA

# Drop every "$" and "," in a single pass, then convert the digits to numbers.
money <- as.numeric(gsub("[$,]", "", money))
mean(money, na.rm = TRUE)

## [1] 50000

# na.omit() drops the NA and records its position in the "na.action" attribute.
na.omit(money)

## [1] 50000 50000 50000 50000 50000
## attr(,"na.action")
## [1] 6
## attr(,"class")
## [1] "omit"

library(readr)
# Same figures again, this time with trailing blanks on most entries.
money <- c("$50000   ", "$50,000   ", "50,000   ", 50000, "50000  ", NA)
# Blank-free copy; the raw vector is left untouched and printed for comparison.
money2 <- gsub(" ", "", money, fixed = TRUE)
money

## [1] "$50000   "  "$50,000   " "50,000   "  "50000"      "50000  "   
## [6] NA

# Dates typed in five different day-month-year layouts.
dates <- c(
  "1april1987", "7-4-1985", "15 MAY 1985", "30/11/1984", "30June1989"
)
library(lubridate)
# dmy() parses text whose components appear in day, month, year order.
dates2 <- dmy(dates)
Sys.Date() # today's date

## [1] "2015-09-19"

# Fixed reference date, so age2 does not depend on the day the script is run.
b <- dmy("1 April 2015")
age1 <- difftime(Sys.Date(), dates2) # elapsed time up to today
age2 <- difftime(b, dates2) # elapsed time up to the reference date

# Names as typed by users: mixed case plus embedded and stray blanks.
# NOTE: this variable shares its name with base::names(). Calls such as
# names(x) still resolve to the function, but the reuse is easy to misread.
names <- c("Ajay", "Sri DHAR", "TUSHIT", "krishna ", " james P")

# Normalised copy: every blank removed (inner ones too), then lower-cased.
names2 <- gsub(" ", "", names, fixed = TRUE)
names2 <- tolower(names2)
names

## [1] "Ajay"     "Sri DHAR" "TUSHIT"   "krishna " " james P"

# Character count of each normalised name.
nchar(names2)

## [1] 4 7 6 7 6

# Characters 2 through 4 of each normalised name.
substr(names2, start = 2, stop = 4)

## [1] "jay" "rid" "ush" "ris" "ame"

names

## [1] "Ajay"     "Sri DHAR" "TUSHIT"   "krishna " " james P"

# Split the raw names on single blanks; a leading blank produces an empty
# first piece, as the last element of the result shows.
names3 <- strsplit(names, split = " ", fixed = TRUE)
names3

## [[1]]
## [1] "Ajay"
## 
## [[2]]
## [1] "Sri"  "DHAR"
## 
## [[3]]
## [1] "TUSHIT"
## 
## [[4]]
## [1] "krishna"
## 
## [[5]]
## [1] ""      "james" "P"

#data from  http://bit.ly/datestdata
library(data.table)

## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, mday, month, quarter, wday, week, yday, year

# Load test.csv from the current working directory as a data.table.
test <- fread("test.csv")
# Preview the first 50 rows.
head(test, n = 50)

##     Hour Index Sessions
##  1:          0       26
##  2:          1       16
##  3:          2       26
##  4:          3       14
##  5:          4       10
##  6:          5       16
##  7:          6        9
##  8:          7       17
##  9:          8       20
## 10:          9        9
## 11:         10       19
## 12:         11       10
## 13:         12       17
## 14:         13       23
## 15:         14       10
## 16:         15        8
## 17:         16       10
## 18:         17        7
## 19:         18        3
## 20:         19        6
## 21:         20       10
## 22:         21       12
## 23:         22        8
## 24:         23        3
## 25:         24       10
## 26:         25        9
## 27:         26       10
## 28:         27        8
## 29:         28        9
## 30:         29        7
## 31:         30       12
## 32:         31       11
## 33:         32        6
## 34:         33        3
## 35:         34       10
## 36:         35       16
## 37:         36        8
## 38:         37       10
## 39:         38       11
## 40:         39       11
## 41:         40        8
## 42:         41        5
## 43:         42        8
## 44:         43        5
## 45:         44        7
## 46:         45        8
## 47:         46       10
## 48:         47        3
## 49:         48        7
## 50:         49        8
##     Hour Index Sessions

# Inspect the column types: Sessions was read as character, not as a number.
str(test)

## Classes 'data.table' and 'data.frame':   9529 obs. of  2 variables:
##  $ Hour Index: int  0 1 2 3 4 5 6 7 8 9 ...
##  $ Sessions  : chr  "26" "16" "26" "14" ...
##  - attr(*, ".internal.selfref")=<externalptr>

# Coerce Sessions to numeric. Entries that are not valid numbers turn into
# NA, which is what the coercion warning below reports.
test$Sessions <- as.numeric(test$Sessions)

## Warning: NAs introduced by coercion

summary(test)

##    Hour Index      Sessions     
##  Min.   :   0   Min.   :  0.00  
##  1st Qu.:2382   1st Qu.: 10.00  
##  Median :4764   Median : 15.00  
##  Mean   :4764   Mean   : 16.71  
##  3rd Qu.:7145   3rd Qu.: 21.00  
##  Max.   :9527   Max.   :215.00  
##  NA's   :1      NA's   :1

# na.omit() removes every row holding an NA in any column; the summary above
# reports one NA per column.

test <- na.omit(test)

# Sessions per hour as a line chart.
plot(test$`Hour Index`, test$Sessions, type = "l")

# for excel files
#library(readxl)

# read_excel() for excel files
getwd()

## [1] "C:/Users/dell/Desktop"

library(readxl)
# NOTE(review): absolute, machine-specific path; adjust before running on
# another computer.
testajay <- read_excel("C:/Users/dell/Downloads/test.xlsx")
str(testajay)

## Classes 'tbl_df', 'tbl' and 'data.frame':    9529 obs. of  2 variables:
##  $ Hour Index: num  0 1 2 3 4 5 6 7 8 9 ...
##  $ Sessions  : num  26 16 26 14 10 16 9 17 20 9 ...

tail(testajay)

##      Hour Index Sessions
## 9524       9523       10
## 9525       9524       11
## 9526       9525        6
## 9527       9526       16
## 9528       9527       10
## 9529         NA   159259

# The last row has no Hour Index and a Sessions value far larger than any
# hourly count (it looks like a grand-total row). Blank out every row that
# lacks an Hour Index instead of hard-coding row number 9529, so the script
# keeps working if the sheet grows or shrinks.
testajay[is.na(testajay[[1]]), ] <- NA

# read_excel() returns a tibble, and testajay[, 1] is then a one-column
# tibble rather than a vector, which plot() cannot use as x/y coordinates.
# [[ ]] extracts the underlying numeric vectors.
plot(
  testajay[[1]], testajay[[2]],
  type = "l", xlab = "Hour Index", ylab = "Sessions"
)

tail(test, 10)

##     Hour Index Sessions
##  1:       9518       14
##  2:       9519       15
##  3:       9520       13
##  4:       9521        8
##  5:       9522        7
##  6:       9523       10
##  7:       9524       11
##  8:       9525        6
##  9:       9526       16
## 10:       9527       10

# Lagged differences of `x`, left-padded with NA so the result has the same
# length as `x` and lines up row-for-row with the source column.
#   x   : numeric vector
#   lag : number of positions to look back (default 1)
# Returns a numeric vector of length(x); the first `lag` entries are NA.
lagged_diff <- function(x, lag = 1L) {
  n <- length(x)
  if (lag >= n) {
    # Too few observations for even one difference.
    return(rep(NA_real_, n))
  }
  c(rep(NA_real_, lag), diff(x, lag = lag))
}

# Running total of sessions. Assigning overwrites any existing Views column,
# so there is no need to delete it first (deleting a column that does not
# exist only produced a warning).
test$Views <- cumsum(test$Sessions)

# diff() returns length(x) - lag values. Assigning that shorter vector to a
# full-length column was silently recycled by old data.table versions
# (misaligned, with wrapped-around values at the end) and is an error in
# current ones. Padding with NA keeps every difference on the row it belongs to.
test$DiffViews <- lagged_diff(test$Sessions)
# 24 hourly rows back.
test$DiffDayViews <- lagged_diff(test$Sessions, lag = 24L)
# 168 hourly rows back (7 * 24).
test$DiffWeekViews <- lagged_diff(test$Sessions, lag = 168L)

# NOTE: the summary output printed below was captured when the three Diff*
# columns were still filled by recycling; with NA padding their statistics
# differ slightly and each column additionally reports its NA count.
summary(test)

##    Hour Index      Sessions          Views          DiffViews        
##  Min.   :   0   Min.   :  0.00   Min.   :    26   Min.   :-78.00000  
##  1st Qu.:2382   1st Qu.: 10.00   1st Qu.: 44996   1st Qu.: -4.00000  
##  Median :4764   Median : 15.00   Median : 81736   Median :  0.00000  
##  Mean   :4764   Mean   : 16.71   Mean   : 83165   Mean   : -0.00273  
##  3rd Qu.:7145   3rd Qu.: 21.00   3rd Qu.:125550   3rd Qu.:  4.00000  
##  Max.   :9527   Max.   :215.00   Max.   :159259   Max.   :120.00000  
##   DiffDayViews       DiffWeekViews       
##  Min.   :-1.74e+02   Min.   :-199.00000  
##  1st Qu.:-5.00e+00   1st Qu.:  -5.00000  
##  Median : 0.00e+00   Median :   0.00000  
##  Mean   :-8.08e-03   Mean   :   0.01637  
##  3rd Qu.: 5.00e+00   3rd Qu.:   4.00000  
##  Max.   : 1.29e+02   Max.   : 204.00000

# Draw the series on a 2 x 3 grid. par() returns the previous settings, which
# are restored afterwards so later plots are not squeezed into the leftover
# panel of this grid.
old_par <- par(mfrow = c(2, 3))
plot(test$Sessions, type = "l")

plot(test$Views, type = "l")
plot(test$DiffViews, type = "l")
plot(test$DiffDayViews, type = "l")
plot(test$DiffWeekViews, type = "l")
par(old_par)

# The running total is no longer needed once it has been plotted.
test$Views <- NULL

# Ten distinct integers drawn from 1..100. No seed is set, so the values (and
# the outputs shown below) change from run to run.
bijay <- sample(seq_len(100), size = 10, replace = FALSE)
bijay

##  [1] 29 13 70 16 40 24 28  8 41 34

# Running total.
cumsum(bijay)

##  [1]  29  42 112 128 168 192 220 228 269 303

# Running maximum.
cummax(bijay)

##  [1] 29 29 70 70 70 70 70 70 70 70

bijay

##  [1] 29 13 70 16 40 24 28  8 41 34

# Each element minus the one three positions earlier; the result is three
# elements shorter than the input.
diff(bijay, lag = 3)

## [1] -13  27 -46  12 -32  17   6

# dataq.R
#
# dell
#
# Sat Sep 19 19:28:42 2015