# Salary figures in the mixed formats they tend to arrive in: currency
# symbols, thousands separators, one bare number and one missing value.
# Because strings are present, c() stores every element as character.
money <- c("$50000", "$50,000", "50,000", 50000, "50000", NA)

# Averaging the raw character vector does not work: mean() warns and
# returns NA. This call is kept to show why the cleaning step is needed.
mean(money, na.rm = TRUE)
## Warning in mean.default(money, na.rm = TRUE): argument is not numeric or
## logical: returning NA
## [1] NA

# Remove "$" and "," in a single pass, then convert to numeric.
# Inside a character class "$" is literal, so no escaping is required.
money <- as.numeric(gsub("[$,]", "", money))
mean(money, na.rm = TRUE)
## [1] 50000

# na.omit() drops the NA and records its position in the "na.action"
# attribute of the result.
na.omit(money)
## [1] 50000 50000 50000 50000 50000
## attr(,"na.action")
## [1] 6
## attr(,"class")
## [1] "omit"
library(readr)
# The same figures again, this time with stray trailing blanks.
money <- c("$50000 ", "$50,000 ", "50,000 ", 50000, "50000 ", NA)
# Delete every literal space; the cleaned copy is stored in money2 ...
money2 <- gsub(" ", "", money, fixed = TRUE)
# ... so printing money still shows the untouched input.
money
## [1] "$50000 " "$50,000 " "50,000 " "50000" "50000 "
## [6] NA
# Day-month-year dates written in five different layouts.
dates <- c(
  "1april1987", "7-4-1985", "15 MAY 1985", "30/11/1984", "30June1989"
)
library(lubridate)
# Parse the day-first strings into a standard date format.
dates2 <- dmy(dates)
# Today's date (the value shown is from the run that produced this transcript).
Sys.Date()
## [1] "2015-09-19"
# A fixed reference date, parsed the same way.
b <- dmy("1 April 2015")
# Time elapsed between each parsed date and today ...
age1 <- difftime(time1 = Sys.Date(), time2 = dates2)
# ... and between each parsed date and the fixed reference date.
age2 <- difftime(time1 = b, time2 = dates2)
# Names with inconsistent case and stray spaces.
names <- c("Ajay", "Sri DHAR", "TUSHIT", "krishna ", " james P")
# Normalised copy: all spaces removed, then lower-cased.
names2 <- tolower(gsub(" ", "", names, fixed = TRUE))
names
## [1] "Ajay" "Sri DHAR" "TUSHIT" "krishna " " james P"
# Number of characters in each normalised name.
nchar(names2)
## [1] 4 7 6 7 6
# Characters 2 to 4 of each normalised name.
substr(names2, start = 2, stop = 4)
## [1] "jay" "rid" "ush" "ris" "ame"
names
## [1] "Ajay" "Sri DHAR" "TUSHIT" "krishna " " james P"
# Split the raw names on single spaces; a leading space yields an empty
# first piece (see element 5 below).
names3 <- strsplit(names, split = " ", fixed = TRUE)
names3
## [[1]]
## [1] "Ajay"
##
## [[2]]
## [1] "Sri" "DHAR"
##
## [[3]]
## [1] "TUSHIT"
##
## [[4]]
## [1] "krishna"
##
## [[5]]
## [1] "" "james" "P"
# Data from http://bit.ly/datestdata
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, mday, month, quarter, wday, week, yday, year
# Read test.csv (resolved against the current working directory).
test <- fread("test.csv")
# Preview the first 50 rows.
head(test, n = 50)
## Hour Index Sessions
## 1: 0 26
## 2: 1 16
## 3: 2 26
## 4: 3 14
## 5: 4 10
## 6: 5 16
## 7: 6 9
## 8: 7 17
## 9: 8 20
## 10: 9 9
## 11: 10 19
## 12: 11 10
## 13: 12 17
## 14: 13 23
## 15: 14 10
## 16: 15 8
## 17: 16 10
## 18: 17 7
## 19: 18 3
## 20: 19 6
## 21: 20 10
## 22: 21 12
## 23: 22 8
## 24: 23 3
## 25: 24 10
## 26: 25 9
## 27: 26 10
## 28: 27 8
## 29: 28 9
## 30: 29 7
## 31: 30 12
## 32: 31 11
## 33: 32 6
## 34: 33 3
## 35: 34 10
## 36: 35 16
## 37: 36 8
## 38: 37 10
## 39: 38 11
## 40: 39 11
## 41: 40 8
## 42: 41 5
## 43: 42 8
## 44: 43 5
## 45: 44 7
## 46: 45 8
## 47: 46 10
## 48: 47 3
## 49: 48 7
## 50: 49 8
## Hour Index Sessions
# Inspect the column types: Sessions was read in as character.
str(test)
## Classes 'data.table' and 'data.frame': 9529 obs. of 2 variables:
## $ Hour Index: int 0 1 2 3 4 5 6 7 8 9 ...
## $ Sessions : chr "26" "16" "26" "14" ...
## - attr(*, ".internal.selfref")=<externalptr>
# Convert Sessions to numeric; entries that cannot be parsed become NA.
test$Sessions <- as.numeric(test$Sessions)
## Warning: NAs introduced by coercion
summary(test)
## Hour Index Sessions
## Min. : 0 Min. : 0.00
## 1st Qu.:2382 1st Qu.: 10.00
## Median :4764 Median : 15.00
## Mean :4764 Mean : 16.71
## 3rd Qu.:7145 3rd Qu.: 21.00
## Max. :9527 Max. :215.00
## NA's :1 NA's :1
# Drop every row that contains an NA (the summary above reports one NA per
# column; the intent is to remove the last row of the data set).
test <- na.omit(test)
# Line plot of Sessions against Hour Index.
plot(test$`Hour Index`, test$Sessions, type = "l")

# For Excel files, use read_excel() from the readxl package.
getwd()
## [1] "C:/Users/dell/Desktop"
library(readxl)
testajay <- read_excel("C:/Users/dell/Downloads/test.xlsx")
str(testajay)
## Classes 'tbl_df', 'tbl' and 'data.frame': 9529 obs. of 2 variables:
## $ Hour Index: num 0 1 2 3 4 5 6 7 8 9 ...
## $ Sessions : num 26 16 26 14 10 16 9 17 20 9 ...
tail(testajay)
## Hour Index Sessions
## 9524 9523 10
## 9525 9524 11
## 9526 9525 6
## 9527 9526 16
## 9528 9527 10
## 9529 NA 159259
# The final row has no Hour Index and a much larger Sessions value (see the
# tail() output above), so blank it out before plotting. nrow() is used
# instead of a hard-coded row number so this keeps working if the sheet
# grows or shrinks.
testajay[nrow(testajay), ] <- NA
# read_excel() returns a tibble (see str() above). Single-bracket indexing
# `[, j]` on a tibble yields a one-column table rather than a vector, which
# plot() cannot use as x/y coordinates; `[[j]]` extracts the plain vectors.
# Axis labels are taken from the column names.
plot(
  testajay[[1]], testajay[[2]],
  type = "l",
  xlab = names(testajay)[1], ylab = names(testajay)[2]
)

tail(test, n = 10)
## Hour Index Sessions
## 1: 9518 14
## 2: 9519 15
## 3: 9520 13
## 4: 9521 8
## 5: 9522 7
## 6: 9523 10
## 7: 9524 11
## 8: 9525 6
## 9: 9526 16
## 10: 9527 10
# Running total of sessions.
test$Views <- cumsum(test$Sessions)
# diff(x, lag = k) returns k fewer values than x has. Assigning such a short
# vector to a column makes data.table recycle it with a warning (newer
# releases refuse the assignment), so each difference is left-padded with
# k NAs. Row i then holds Sessions[i] - Sessions[i - k], and the first k
# rows, which have no earlier value to compare against, are NA.
test$DiffViews <- c(NA, diff(test$Sessions))                      # hour over hour
test$DiffDayViews <- c(rep(NA, 24), diff(test$Sessions, lag = 24))    # 24 rows back
test$DiffWeekViews <- c(rep(NA, 168), diff(test$Sessions, lag = 168)) # 168 rows back
summary(test)
## NOTE: the figures below were captured when the Diff* columns were filled
## by recycling. With NA padding the Diff* columns additionally report NA
## counts (1, 24 and 168) and their means can differ marginally.
## Hour Index Sessions Views DiffViews
## Min. : 0 Min. : 0.00 Min. : 26 Min. :-78.00000
## 1st Qu.:2382 1st Qu.: 10.00 1st Qu.: 44996 1st Qu.: -4.00000
## Median :4764 Median : 15.00 Median : 81736 Median : 0.00000
## Mean :4764 Mean : 16.71 Mean : 83165 Mean : -0.00273
## 3rd Qu.:7145 3rd Qu.: 21.00 3rd Qu.:125550 3rd Qu.: 4.00000
## Max. :9527 Max. :215.00 Max. :159259 Max. :120.00000
## DiffDayViews DiffWeekViews
## Min. :-1.74e+02 Min. :-199.00000
## 1st Qu.:-5.00e+00 1st Qu.: -5.00000
## Median : 0.00e+00 Median : 0.00000
## Mean :-8.08e-03 Mean : 0.01637
## 3rd Qu.: 5.00e+00 3rd Qu.: 4.00000
## Max. : 1.29e+02 Max. : 204.00000
# Show the five series on a 2 x 3 grid of panels.
par(mfrow = c(2, 3))
plot(test$Sessions, type = "l")
plot(test$Views, type = "l")
plot(test$DiffViews, type = "l")
plot(test$DiffDayViews, type = "l")
plot(test$DiffWeekViews, type = "l")
# Remove the running-total column again.
test$Views <- NULL
# Draw 10 distinct integers from 1..100. No seed is set, so the values
# differ from run to run; the outputs below come from a single run.
# FALSE is spelled out because the shorthand F is an ordinary variable
# that can be reassigned.
bijay <- sample(1:100, size = 10, replace = FALSE)
bijay
## [1] 29 13 70 16 40 24 28 8 41 34
# Running total.
cumsum(bijay)
## [1] 29 42 112 128 168 192 220 228 269 303
# Running maximum.
cummax(bijay)
## [1] 29 29 70 70 70 70 70 70 70 70
bijay
## [1] 29 13 70 16 40 24 28 8 41 34
# Difference between each value and the one three positions earlier;
# the result is three elements shorter than the input.
diff(bijay, lag = 3)
## [1] -13 27 -46 12 -32 17 6
