# Arithmetic in R works much like typing into a calculator.
# Addition
5 + 6
## [1] 11
# Subtraction
10 - 7
## [1] 3
# Multiplication
10 * 2
## [1] 20
# Division
20 / 5
## [1] 4
# sqrt() returns the square root of the number passed in parentheses
sqrt(49)
## [1] 7
# ^ raises a number to a power; the number after ^ is the exponent
5^3 # 5 to the power 3
## [1] 125
# ^ also computes roots when the exponent is a fraction. The fraction must be
# wrapped in parentheses: ^ binds tighter than /, so 27^1/3 is (27^1)/3 = 9.
27^(1/3) # cube root
## [1] 3
# exp() computes e raised to the given power
exp(2)
## [1] 7.389056
# log() computes a logarithm; the base can be supplied explicitly
log(10, base = 10)
## [1] 1
# factorial() computes n!
factorial(5)
## [1] 120
# choose(n, x) counts combinations (order ignored); it does not count
# permutations
choose(5, 2) # n! / (x! * (n - x)!)
## [1] 10
# floor() returns the largest integer that is less than or equal to its
# argument
floor(5.4)
## [1] 5
floor(5.8)
## [1] 5
# ceiling() returns the smallest integer that is greater than or equal to its
# argument
ceiling(5.4)
## [1] 6
ceiling(5.8)
## [1] 6
# round() rounds to the nearest value with the requested number of decimal
# places (it does not always round up)
round(5.783, digits = 0)
## [1] 6
round(5.783, digits = 1)
## [1] 5.8
round(5.786, digits = 2)
## [1] 5.79
# Trigonometric functions take their argument in radians
cos(0)
## [1] 1
sin(pi / 2)
## [1] 1
# Assigning a value to a variable
Ashwani <- 50
# Variable names are case sensitive, so the lower-case spelling below would
# fail: 50 was assigned to "Ashwani" with a capital A.
# ashwani
Ashwani
## [1] 50
# Real numbers
a <- 5
is.numeric(a)
## [1] TRUE
# Complex numbers
b <- 7 + 3i
is.complex(b)
## [1] TRUE
is.numeric(b) # FALSE: a complex value is not of numeric type
## [1] FALSE
# Factors are categorical variables with a fixed set of levels
gender_class <- factor(
  c("Male", "Female", "Male", "Transgender", "Female")
)
class(gender_class) # type of the variable
## [1] "factor"
levels(gender_class) # the distinct categories
## [1] "Female" "Male" "Transgender"
nlevels(gender_class) # how many distinct categories there are
## [1] 3
# Greater-than operator
5 > 2
## [1] TRUE
# Less-than operator
5 < 2
## [1] FALSE
# Greater-than-or-equal operator
3 >= 2
## [1] TRUE
# Equality operator
TRUE == FALSE
## [1] FALSE
# Inequality operator
TRUE != FALSE
## [1] TRUE
# Generating sequences
1:8
## [1] 1 2 3 4 5 6 7 8
seq(1, 8)
## [1] 1 2 3 4 5 6 7 8
seq(0, 1.8, by = 0.2) # sequence in steps of 0.2
## [1] 0.0 0.2 0.4 0.6 0.8 1.0 1.2 1.4 1.6 1.8
# Generating repeated items
rep(1, times = 5) # the value 1 repeated 5 times
## [1] 1 1 1 1 1
rep(seq(1, 5), times = 2) # the whole sequence 1..5 repeated twice
## [1] 1 2 3 4 5 1 2 3 4 5
rep(1:4, times = 3, each = 2) # each element doubled, then the result 3 times
## [1] 1 1 2 2 3 3 4 4 1 1 2 2 3 3 4 4 1 1 2 2 3 3 4 4
# Finding missing values
x <- c(4, NA, 7)
is.na(x)
## [1] FALSE TRUE FALSE
x[!is.na(x)] # keeps only the non-missing elements (x itself is unchanged)
## [1] 4 7
# Sorting vectors
z <- c(8, 3, 5, 7, 6, 6, 8, 9, 2, 3, 9, 4, 10, 4, 11)
sort(z)
## [1] 2 3 3 4 4 5 6 6 7 8 8 9 9 10 11
sort(z, decreasing = TRUE) # descending order
## [1] 11 10 9 9 8 8 7 6 6 5 4 4 3 3 2
head(sort(z, decreasing = TRUE), 5) # the first 5 items in descending order
## [1] 11 10 9 9 8
# Range (minimum and maximum) of a vector
range(z)
## [1] 2 11
# Other functions on vectors
which(z > 6) # positions (not values) of the items greater than 6
## [1] 1 4 7 8 11 13 15
length(z) # number of elements in z
## [1] 15
person_names <- c("Ashwani", "Ashu", "Smith", "Ashwani", "Smith", "Ashu")
unique(person_names) # the distinct items in the vector
## [1] "Ashwani" "Ashu" "Smith"
duplicated(person_names) # TRUE where an item repeats an earlier one
## [1] FALSE FALSE FALSE TRUE TRUE TRUE
# Creating a matrix; the values fill it column by column
x <- matrix(c(1, 2, 3, 4, 1, 3, 4, 5, 1), nrow = 3)
x
## [,1] [,2] [,3]
## [1,] 1 4 4
## [2,] 2 1 5
## [3,] 3 3 1
# Extracting an element of a matrix
x[2, 1] # element in the 2nd row, 1st column
## [1] 2
# Means of rows or columns
mean(x[, 2]) # mean of the 2nd column
## [1] 2.666667
colMeans(x) # mean of every column
## [1] 2.000000 2.666667 3.333333
rowMeans(x) # mean of every row
## [1] 3.000000 2.666667 2.333333
# Sums of rows or columns
sum(x[, 2]) # sum of the 2nd column
## [1] 8
colSums(x) # sum of every column
## [1] 6 8 10
rowSums(x) # sum of every row
## [1] 9 8 7
# Math operations on matrices
m1 <- matrix(seq(1, 30, by = 2), nrow = 5)
m1
## [,1] [,2] [,3]
## [1,] 1 11 21
## [2,] 3 13 23
## [3,] 5 15 25
## [4,] 7 17 27
## [5,] 9 19 29
m2 <- matrix(2:16, nrow = 5)
m2
## [,1] [,2] [,3]
## [1,] 2 7 12
## [2,] 3 8 13
## [3,] 4 9 14
## [4,] 5 10 15
## [5,] 6 11 16
# Addition of matrices (element by element)
m1 + m2
## [,1] [,2] [,3]
## [1,] 3 18 33
## [2,] 6 21 36
## [3,] 9 24 39
## [4,] 12 27 42
## [5,] 15 30 45
# Subtraction of matrices (element by element)
m2 - m1
## [,1] [,2] [,3]
## [1,] 1 -4 -9
## [2,] 0 -5 -10
## [3,] -1 -6 -11
## [4,] -2 -7 -12
## [5,] -3 -8 -13
# Element-wise multiplication of matrices; * multiplies matching entries,
# while the matrix (dot) product uses %*% instead
m1 * m2
## [,1] [,2] [,3]
## [1,] 2 77 252
## [2,] 9 104 299
## [3,] 20 135 350
## [4,] 35 170 405
## [5,] 54 209 464
# for loop: visit 1 to 5 in turn
for (i in seq_len(5)) {
  print(i + i) # print each value added to itself
}
## [1] 2
## [1] 4
## [1] 6
## [1] 8
## [1] 10
# Nested for loop over a matrix: the outer loop walks the rows and the inner
# loop walks the columns, so the elements print row by row.
m <- matrix(1:9, nrow = 3)
# seq_len() gives an empty sequence when a dimension is 0, whereas 1:nrow(m)
# would count down as c(1, 0) and index outside the matrix.
for (i in seq_len(nrow(m))) {
  for (j in seq_len(ncol(m))) {
    print(m[i, j])
  }
}
## [1] 1
## [1] 4
## [1] 7
## [1] 2
## [1] 5
## [1] 8
## [1] 3
## [1] 6
## [1] 9
# while loop: print the squares of 2 through 6
i <- 2
while (i <= 6) {
  print(i^2)
  i <- i + 1 # advance the counter so the loop terminates
}
## [1] 4
## [1] 9
## [1] 16
## [1] 25
## [1] 36
# if/else inside a function
#
# OddEven(x) reports whether a number is even or odd.
#   x: a single, finite numeric value.
#   Returns "The number is even" when x %% 2 is 0, otherwise
#   "The number is odd". A non-whole value such as 2.5 therefore reports odd,
#   because its remainder is not 0.
#   Stops with an error when x is not a single finite number.
OddEven <- function(x) {
  # `if` needs exactly one TRUE/FALSE, so reject anything else up front with a
  # readable message instead of a cryptic failure further down.
  if (!is.numeric(x) || length(x) != 1L || !is.finite(x)) {
    stop("`x` must be a single finite number", call. = FALSE)
  }
  if (x %% 2 == 0) {
    "The number is even"
  } else {
    "The number is odd"
  }
}
print(OddEven(12))
## [1] "The number is even"
text <- "Lorem Ipsum is simply dummy text of the printing and typesetting industry."
# Extract a substring by character position
substr(text, start = 1, stop = 5) # characters 1 through 5
## [1] "Lorem"
# Overwrite characters 1 to 6 in place; this replaces the existing characters
# (including the space after "Lorem") and does not insert new ones
substr(text, start = 1, stop = 6) <- "Change"
# Convert text to upper case
toupper(text)
## [1] "CHANGEIPSUM IS SIMPLY DUMMY TEXT OF THE PRINTING AND TYPESETTING INDUSTRY."
# Convert text to lower case
tolower(text)
## [1] "changeipsum is simply dummy text of the printing and typesetting industry."
# Current date and time, including the time zone
Sys.time()
## [1] "2022-08-20 22:21:22 IST"
# Convert the date-time (POSIX) value to a plain number
as.numeric(Sys.time())
## [1] 1661014283
# Difference between two dates/times
difftime(time1 = "2021-10-21", time2 = "2021-8-15")
## Time difference of 67 days
library(RCurl)#open installed packages
# Load the transactions data from CSV
file1 <- read.csv(
  file = "D:\\ISB 2021\\Foundational R\\Ashwani_Singh_assignment R\\Retail data set\\Transactions.csv"
)
class(file1)
## [1] "data.frame"
# Show the first 6 rows of the dataset
head(file1, n = 6)
## transaction_id cust_id tran_date prod_subcat_code prod_cat_code Qty Rate
## 1 80712190438 270351 28-02-2014 1 1 -5 -772
## 2 29258453508 270384 27-02-2014 5 3 -5 -1497
## 3 51750724947 273420 24-02-2014 6 5 -2 -791
## 4 93274880719 271509 24-02-2014 11 6 -3 -1363
## 5 51750724947 273420 23-02-2014 6 5 -2 -791
## 6 97439039119 272357 23-02-2014 8 3 -2 -824
## Tax total_amt Store_type
## 1 405.300 -4265.300 e-Shop
## 2 785.925 -8270.925 e-Shop
## 3 166.110 -1748.110 TeleShop
## 4 429.345 -4518.345 e-Shop
## 5 166.110 -1748.110 TeleShop
## 6 173.040 -1821.040 TeleShop
# Show the last 6 rows of the dataset
tail(file1, n = 6)
## transaction_id cust_id tran_date prod_subcat_code prod_cat_code Qty Rate
## 23048 30856003613 266866 25-01-2011 4 2 2 444
## 23049 94340757522 274550 25-01-2011 12 5 1 1264
## 23050 89780862956 270022 25-01-2011 4 1 1 677
## 23051 85115299378 271020 25-01-2011 2 6 4 1052
## 23052 72870271171 270911 25-01-2011 11 5 3 1142
## 23053 77960931771 271961 25-01-2011 11 5 1 447
## Tax total_amt Store_type
## 23048 93.240 981.240 TeleShop
## 23049 132.720 1396.720 e-Shop
## 23050 71.085 748.085 e-Shop
## 23051 441.840 4649.840 MBR
## 23052 359.730 3785.730 TeleShop
## 23053 46.935 493.935 TeleShop
# Structure of the dataset: dimensions plus each column's type and first values
str(object = file1)
## 'data.frame': 23053 obs. of 10 variables:
## $ transaction_id : num 8.07e+10 2.93e+10 5.18e+10 9.33e+10 5.18e+10 ...
## $ cust_id : int 270351 270384 273420 271509 273420 272357 273667 271489 275108 269014 ...
## $ tran_date : chr "28-02-2014" "27-02-2014" "24-02-2014" "24-02-2014" ...
## $ prod_subcat_code: int 1 5 6 11 6 8 11 12 3 8 ...
## $ prod_cat_code : int 1 3 5 6 5 3 6 6 1 3 ...
## $ Qty : int -5 -5 -2 -3 -2 -2 -1 -1 -3 -4 ...
## $ Rate : int -772 -1497 -791 -1363 -791 -824 -1450 -1225 -908 -581 ...
## $ Tax : num 405 786 166 429 166 ...
## $ total_amt : num -4265 -8271 -1748 -4518 -1748 ...
## $ Store_type : chr "e-Shop" "e-Shop" "TeleShop" "e-Shop" ...
# List the column names
colnames(file1)
## [1] "transaction_id" "cust_id" "tran_date" "prod_subcat_code"
## [5] "prod_cat_code" "Qty" "Rate" "Tax"
## [9] "total_amt" "Store_type"
# Rename a column by position: the 2nd header becomes "customer_Id"
colnames(file1)[2] <- "customer_Id"
colnames(file1)
## [1] "transaction_id" "customer_Id" "tran_date" "prod_subcat_code"
## [5] "prod_cat_code" "Qty" "Rate" "Tax"
## [9] "total_amt" "Store_type"
# Per-column summary statistics of the data
summary(object = file1)
## transaction_id customer_Id tran_date prod_subcat_code
## Min. :3.269e+06 Min. :266783 Length:23053 Min. : 1.000
## 1st Qu.:2.494e+10 1st Qu.:268935 Class :character 1st Qu.: 3.000
## Median :5.009e+10 Median :270980 Mode :character Median : 5.000
## Mean :5.007e+10 Mean :271022 Mean : 6.149
## 3rd Qu.:7.533e+10 3rd Qu.:273114 3rd Qu.:10.000
## Max. :9.999e+10 Max. :275265 Max. :12.000
## prod_cat_code Qty Rate Tax
## Min. :1.000 Min. :-5.000 Min. :-1499.0 Min. : 7.35
## 1st Qu.:2.000 1st Qu.: 1.000 1st Qu.: 312.0 1st Qu.: 98.28
## Median :4.000 Median : 3.000 Median : 710.0 Median :199.08
## Mean :3.764 Mean : 2.432 Mean : 636.4 Mean :248.67
## 3rd Qu.:5.000 3rd Qu.: 4.000 3rd Qu.: 1109.0 3rd Qu.:365.71
## Max. :6.000 Max. : 5.000 Max. : 1500.0 Max. :787.50
## total_amt Store_type
## Min. :-8270.9 Length:23053
## 1st Qu.: 762.5 Class :character
## Median : 1754.7 Mode :character
## Mean : 2107.3
## 3rd Qu.: 3569.2
## Max. : 8287.5
# Load the customer data and join it to the transactions on the shared
# "customer_Id" column
file2 <- read.csv(
  file = "D:\\ISB 2021\\Foundational R\\Ashwani_Singh_assignment R\\Retail data set\\Customer.csv"
)
df <- merge(x = file2, y = file1, by = "customer_Id")
head(df, n = 6)
## customer_Id DOB Gender city_code transaction_id tran_date
## 1 266783 01-05-1974 M 4 8410316370 20-02-2013
## 2 266783 01-05-1974 M 4 98477711300 21-10-2012
## 3 266783 01-05-1974 M 4 25890929042 24-09-2011
## 4 266783 01-05-1974 M 4 16999552161 9/2/2013
## 5 266783 01-05-1974 M 4 25890929042 23-09-2011
## 6 266784 13-12-1991 F 10 36310127403 4/12/2012
## prod_subcat_code prod_cat_code Qty Rate Tax total_amt Store_type
## 1 4 1 1 869 91.245 960.245 e-Shop
## 2 4 1 3 93 29.295 308.295 TeleShop
## 3 1 2 -4 -1321 554.820 -5838.820 e-Shop
## 4 10 5 2 835 175.350 1845.350 e-Shop
## 5 1 2 4 1321 554.820 5838.820 e-Shop
## 6 4 3 2 200 42.000 442.000 Flagship store
# Total number of missing values across the whole dataset
missing_mask <- is.na(file1)
sum(missing_mask)
## [1] 0
# Number of missing values in each column
colSums(missing_mask)
## transaction_id customer_Id tran_date prod_subcat_code
## 0 0 0 0
## prod_cat_code Qty Rate Tax
## 0 0 0 0
## total_amt Store_type
## 0 0
# na.omit() would drop every row that contains a missing value
# na.omit(file1)
# A histogram shows the distribution of a single variable
hist(x = file1$total_amt, main = "Histogram of Total amount")
# A scatter plot shows the relationship between two variables
plot(
  x = file1$Rate,
  y = file1$total_amt,
  main = "Scatterplot of Rate and Total amount",
  xlab = "Rate",
  ylab = "Amount"
)
# There are many ways to use the functions above. R functions mostly complement each other, and I have tried to show that through a few examples in this notebook.
# Thank you, and I hope you liked my work.