Foundation Assignment R- Q1

Basic R Calculations

#Calculations are extremely simple in R. It is like typing into a calculator.

#addition in R
5+6

## [1] 11

#Subtraction in R
10-7

## [1] 3

#Multiplication in R
10*2

## [1] 20

#Division in R
20/5

## [1] 4

#sqrt is used to find Square Root of a number. The number is written in bracket i.e.
sqrt(49)

## [1] 7

# ^ is for power in R. Number after ^ is power i.e.
5^3 #5 power 3

## [1] 125

# ^ can also be used for roots. Wrap the fractional exponent in parentheses:
# ^ binds tighter than /, so 27^1/3 is evaluated as (27^1)/3 = 9, not a cube root.
27^(1/3) #cube root

## [1] 3

#exp is used for Exponential in R
exp(2)

## [1] 7.389056

#Log is used for Logarithm in R. You can assign base as well.
log(10, base= 10)

## [1] 1

#Finding factorial in R is made simple by
factorial(5)

## [1] 120

#Combinations ("n choose x") are computed with the choose command in R
choose(5,2) # n!/(x!(n-x)!)

## [1] 10

#Floor is used to return largest integer that is smaller than or equal to value passed in the argument
floor(5.4)

## [1] 5

floor(5.8)

## [1] 5

#Ceiling is used to return the smallest integer that is greater than or equal to the value passed in the argument 
ceiling(5.4)

## [1] 6

ceiling(5.8)

## [1] 6

#Round is used to round a number to the given number of decimal places (digits)
round(5.783, digits = 0)

## [1] 6

round(5.783, digits = 1)

## [1] 5.8

round(5.786, digits = 2)

## [1] 5.79

# Trigonometry is fairly simple too  
cos(0)

## [1] 1

sin(pi/2)

## [1] 1

#Assigning variable in R
Ashwani <- 50
#variable is case sensitive. Hence
#ashwani #gives an error because 50 was assigned to "Ashwani" with A cap
Ashwani

## [1] 50

#Real Numbers in R 
a <- 5
is.numeric(a)

## [1] TRUE

#Complex numbers in R
b <- 7+3i
is.complex(b)

## [1] TRUE

is.numeric(b)#This will give False as b is not numeric number

## [1] FALSE

#Factors in R are categorical variables that have a fixed number of levels.
gender_class <- factor(c("Male", "Female","Male", "Transgender","Female"))
class(gender_class)#gives the type of variable

## [1] "factor"

levels(gender_class)#gives the unique categorical variables

## [1] "Female"      "Male"        "Transgender"

nlevels(gender_class) #gives number of unique categorical variables

## [1] 3

Operators and Other Operations in R

#Greater than Operator
5>2

## [1] TRUE

#Less Than Operator
5<2

## [1] FALSE

#Greater than or equal to Operator
3>=2

## [1] TRUE

#Logical Equals operator
TRUE==FALSE

## [1] FALSE

#Logical Not Equal Operator
TRUE!=FALSE

## [1] TRUE

#Generating sequence in R
1:8

## [1] 1 2 3 4 5 6 7 8

seq(1,8)

## [1] 1 2 3 4 5 6 7 8

seq(0,1.8,0.2) #Sequence with difference of 0.2

##  [1] 0.0 0.2 0.4 0.6 0.8 1.0 1.2 1.4 1.6 1.8

#Generate Repeated items in R
rep(1,5) #rep(x=1,times=5)

## [1] 1 1 1 1 1

rep(seq(1,5), 2) #rep(x=seq(1:5),times=2)

##  [1] 1 2 3 4 5 1 2 3 4 5

rep(1:4,each=2,3) #rep(1:4, each = 2, times = 3)

##  [1] 1 1 2 2 3 3 4 4 1 1 2 2 3 3 4 4 1 1 2 2 3 3 4 4

#Finding Missing Value 
x <- c(4,NA,7)
is.na(x)

## [1] FALSE  TRUE FALSE

x[!is.na(x)] #removing na from vector

## [1] 4 7

#Sorting of vectors in R
z <- c(8,3,5,7,6,6,8,9,2,3,9,4,10,4,11)
sort(z)

##  [1]  2  3  3  4  4  5  6  6  7  8  8  9  9 10 11

rev(sort(z)) #Reverse sort

##  [1] 11 10  9  9  8  8  7  6  6  5  4  4  3  3  2

rev(sort(z))[1:5] #Reverse sort, then keep only the first 5 items (the 5 largest values)

## [1] 11 10  9  9  8

#Range of vectors R
range(z)

## [1]  2 11

#Functions on vector
which(z>6) #gives the positions (indices) of the items greater than 6, not the items themselves

## [1]  1  4  7  8 11 13 15

length(z) #gives length of z

## [1] 15

person_names <- c("Ashwani","Ashu","Smith","Ashwani","Smith","Ashu")
unique(person_names) #gives the list of unique items in the vector

## [1] "Ashwani" "Ashu"    "Smith"

duplicated(person_names) #gives a logical vector: TRUE where an item repeats an earlier item in the vector

## [1] FALSE FALSE FALSE  TRUE  TRUE  TRUE

Matrices in R

x <- matrix(c(1,2,3,4,1,3,4,5,1),nrow=3)
x

##      [,1] [,2] [,3]
## [1,]    1    4    4
## [2,]    2    1    5
## [3,]    3    3    1

#extract element of matrix
x[2,1] #gives element from 2nd row and 1st column

## [1] 2

#mean of row or column
mean(x[,2]) #gives mean of the 2nd column

## [1] 2.666667

colMeans(x) #gives mean of columns

## [1] 2.000000 2.666667 3.333333

rowMeans(x) #gives mean of rows

## [1] 3.000000 2.666667 2.333333

#sum of row or column
sum(x[,2]) #gives sum of the 2nd column

## [1] 8

colSums(x) #gives sum of each column

## [1]  6  8 10

rowSums(x) #gives sum of each row

## [1] 9 8 7

#Math Operations on Matrices
m1 <- matrix(seq(1,30,2),nrow=5)
m1

##      [,1] [,2] [,3]
## [1,]    1   11   21
## [2,]    3   13   23
## [3,]    5   15   25
## [4,]    7   17   27
## [5,]    9   19   29

m2 <- matrix(2:16,nrow=5)
m2

##      [,1] [,2] [,3]
## [1,]    2    7   12
## [2,]    3    8   13
## [3,]    4    9   14
## [4,]    5   10   15
## [5,]    6   11   16

#Addition of Matrices
m1+m2

##      [,1] [,2] [,3]
## [1,]    3   18   33
## [2,]    6   21   36
## [3,]    9   24   39
## [4,]   12   27   42
## [5,]   15   30   45

#Subtraction in Matrices
m2-m1

##      [,1] [,2] [,3]
## [1,]    1   -4   -9
## [2,]    0   -5  -10
## [3,]   -1   -6  -11
## [4,]   -2   -7  -12
## [5,]   -3   -8  -13

#Element-wise multiplication of Matrices (use %*% for the matrix product)
m1*m2

##      [,1] [,2] [,3]
## [1,]    2   77  252
## [2,]    9  104  299
## [3,]   20  135  350
## [4,]   35  170  405
## [5,]   54  209  464

Loops in R

#For Loop in R
for (i in seq_len(5)) {
  print(2L * i) #each value of i doubled, printed once per iteration
}

## [1] 2
## [1] 4
## [1] 6
## [1] 8
## [1] 10

#For loop on Matrix: print every element, row by row
m <- matrix(1:9, nrow = 3)
# seq_len() yields an empty sequence when a dimension is 0, whereas 1:0
# counts down (1, 0) and would index outside the matrix.
for (i in seq_len(nrow(m))) {
  for (j in seq_len(ncol(m))) {
    print(m[i, j])
  }
}

## [1] 1
## [1] 4
## [1] 7
## [1] 2
## [1] 5
## [1] 8
## [1] 3
## [1] 6
## [1] 9

#While Loop in R: print the squares of 2 through 6
i <- 2
repeat {
  if (i > 6) break #stop once the counter has passed 6
  print(i^2)
  i <- i + 1
}

## [1] 4
## [1] 9
## [1] 16
## [1] 25
## [1] 36

#if else and function in R
OddEven <- function(x) {
  # Label a single number by parity: a remainder of 0 modulo 2 means even,
  # anything else is reported as odd.
  is_even <- x %% 2 == 0
  if (is_even) {
    "The number is even"
  } else {
    "The number is odd"
  }
}
print(OddEven(12))

## [1] "The number is even"

Operations on Strings in R

text="Lorem Ipsum is simply dummy text of the printing and typesetting industry."
#Extract Substrings from specific position of the text
substr(text,1,5) #substr(x, start, stop): returns characters 1 through 5

## [1] "Lorem"

#overwrite characters 1 to 6 ("Lorem ") in place; this replaces text, it does not insert,
#so the space after "Lorem" is consumed as well
substr(text,1,6) <- "Change"

#Change text to upper case
toupper(text)

## [1] "CHANGEIPSUM IS SIMPLY DUMMY TEXT OF THE PRINTING AND TYPESETTING INDUSTRY."

#Change text to lower case
tolower(text)

## [1] "changeipsum is simply dummy text of the printing and typesetting industry."

Date & Time in R

#Detailed System time in R 
Sys.time()

## [1] "2022-08-20 22:21:22 IST"

#Change date & time type(POSIX system) to number
as.numeric(Sys.time())

## [1] 1661014283

#Subtraction Date & time
difftime("2021-10-21","2021-8-15")

## Time difference of 67 days

Read file in R

library(RCurl)#load the installed RCurl package; NOTE(review): nothing visible below calls it (read.csv comes from utils) - confirm it is needed

file1 <- read.csv("D:\\ISB 2021\\Foundational R\\Ashwani_Singh_assignment R\\Retail data set\\Transactions.csv")
class(file1)

## [1] "data.frame"

#read first 6 rows of the dataset
head(file1)

##   transaction_id cust_id  tran_date prod_subcat_code prod_cat_code Qty  Rate
## 1    80712190438  270351 28-02-2014                1             1  -5  -772
## 2    29258453508  270384 27-02-2014                5             3  -5 -1497
## 3    51750724947  273420 24-02-2014                6             5  -2  -791
## 4    93274880719  271509 24-02-2014               11             6  -3 -1363
## 5    51750724947  273420 23-02-2014                6             5  -2  -791
## 6    97439039119  272357 23-02-2014                8             3  -2  -824
##       Tax total_amt Store_type
## 1 405.300 -4265.300     e-Shop
## 2 785.925 -8270.925     e-Shop
## 3 166.110 -1748.110   TeleShop
## 4 429.345 -4518.345     e-Shop
## 5 166.110 -1748.110   TeleShop
## 6 173.040 -1821.040   TeleShop

#read last 6 rows of the dataset
tail(file1)

##       transaction_id cust_id  tran_date prod_subcat_code prod_cat_code Qty Rate
## 23048    30856003613  266866 25-01-2011                4             2   2  444
## 23049    94340757522  274550 25-01-2011               12             5   1 1264
## 23050    89780862956  270022 25-01-2011                4             1   1  677
## 23051    85115299378  271020 25-01-2011                2             6   4 1052
## 23052    72870271171  270911 25-01-2011               11             5   3 1142
## 23053    77960931771  271961 25-01-2011               11             5   1  447
##           Tax total_amt Store_type
## 23048  93.240   981.240   TeleShop
## 23049 132.720  1396.720     e-Shop
## 23050  71.085   748.085     e-Shop
## 23051 441.840  4649.840        MBR
## 23052 359.730  3785.730   TeleShop
## 23053  46.935   493.935   TeleShop

#Information about the dataset
str(file1)

## 'data.frame':    23053 obs. of  10 variables:
##  $ transaction_id  : num  8.07e+10 2.93e+10 5.18e+10 9.33e+10 5.18e+10 ...
##  $ cust_id         : int  270351 270384 273420 271509 273420 272357 273667 271489 275108 269014 ...
##  $ tran_date       : chr  "28-02-2014" "27-02-2014" "24-02-2014" "24-02-2014" ...
##  $ prod_subcat_code: int  1 5 6 11 6 8 11 12 3 8 ...
##  $ prod_cat_code   : int  1 3 5 6 5 3 6 6 1 3 ...
##  $ Qty             : int  -5 -5 -2 -3 -2 -2 -1 -1 -3 -4 ...
##  $ Rate            : int  -772 -1497 -791 -1363 -791 -824 -1450 -1225 -908 -581 ...
##  $ Tax             : num  405 786 166 429 166 ...
##  $ total_amt       : num  -4265 -8271 -1748 -4518 -1748 ...
##  $ Store_type      : chr  "e-Shop" "e-Shop" "TeleShop" "e-Shop" ...

Basic EDA techniques

#Get the name of columns
colnames(file1)

##  [1] "transaction_id"   "cust_id"          "tran_date"        "prod_subcat_code"
##  [5] "prod_cat_code"    "Qty"              "Rate"             "Tax"             
##  [9] "total_amt"        "Store_type"

#Change name of the column in R

colnames(file1)[2]="customer_Id" #changing name of 2nd column header
colnames(file1)

##  [1] "transaction_id"   "customer_Id"      "tran_date"        "prod_subcat_code"
##  [5] "prod_cat_code"    "Qty"              "Rate"             "Tax"             
##  [9] "total_amt"        "Store_type"

#Information about data  
summary(file1)

##  transaction_id       customer_Id      tran_date         prod_subcat_code
##  Min.   :3.269e+06   Min.   :266783   Length:23053       Min.   : 1.000  
##  1st Qu.:2.494e+10   1st Qu.:268935   Class :character   1st Qu.: 3.000  
##  Median :5.009e+10   Median :270980   Mode  :character   Median : 5.000  
##  Mean   :5.007e+10   Mean   :271022                      Mean   : 6.149  
##  3rd Qu.:7.533e+10   3rd Qu.:273114                      3rd Qu.:10.000  
##  Max.   :9.999e+10   Max.   :275265                      Max.   :12.000  
##  prod_cat_code        Qty              Rate              Tax        
##  Min.   :1.000   Min.   :-5.000   Min.   :-1499.0   Min.   :  7.35  
##  1st Qu.:2.000   1st Qu.: 1.000   1st Qu.:  312.0   1st Qu.: 98.28  
##  Median :4.000   Median : 3.000   Median :  710.0   Median :199.08  
##  Mean   :3.764   Mean   : 2.432   Mean   :  636.4   Mean   :248.67  
##  3rd Qu.:5.000   3rd Qu.: 4.000   3rd Qu.: 1109.0   3rd Qu.:365.71  
##  Max.   :6.000   Max.   : 5.000   Max.   : 1500.0   Max.   :787.50  
##    total_amt        Store_type       
##  Min.   :-8270.9   Length:23053      
##  1st Qu.:  762.5   Class :character  
##  Median : 1754.7   Mode  :character  
##  Mean   : 2107.3                     
##  3rd Qu.: 3569.2                     
##  Max.   : 8287.5

Merge Operations

#Read the customer table and join it to the transactions on the shared key customer_Id
#(column 2 of file1 was renamed to customer_Id earlier so the key names match).
#merge() with default arguments keeps only rows whose key appears in both tables.
file2 <- read.csv("D:\\ISB 2021\\Foundational R\\Ashwani_Singh_assignment R\\Retail data set\\Customer.csv")
df= merge(file2,file1, by="customer_Id")
head(df)

##   customer_Id        DOB Gender city_code transaction_id  tran_date
## 1      266783 01-05-1974      M         4     8410316370 20-02-2013
## 2      266783 01-05-1974      M         4    98477711300 21-10-2012
## 3      266783 01-05-1974      M         4    25890929042 24-09-2011
## 4      266783 01-05-1974      M         4    16999552161   9/2/2013
## 5      266783 01-05-1974      M         4    25890929042 23-09-2011
## 6      266784 13-12-1991      F        10    36310127403  4/12/2012
##   prod_subcat_code prod_cat_code Qty  Rate     Tax total_amt     Store_type
## 1                4             1   1   869  91.245   960.245         e-Shop
## 2                4             1   3    93  29.295   308.295       TeleShop
## 3                1             2  -4 -1321 554.820 -5838.820         e-Shop
## 4               10             5   2   835 175.350  1845.350         e-Shop
## 5                1             2   4  1321 554.820  5838.820         e-Shop
## 6                4             3   2   200  42.000   442.000 Flagship store

Dealing with Missing Values

#Finding total missing values in the dataset
sum(is.na(file1))

## [1] 0

#Finding total missing values column wise 
colSums(is.na(file1))

##   transaction_id      customer_Id        tran_date prod_subcat_code 
##                0                0                0                0 
##    prod_cat_code              Qty             Rate              Tax 
##                0                0                0                0 
##        total_amt       Store_type 
##                0                0

#removes rows with any missing 
#na.omit(file1)

Plots

#Histogram is used to understand the distribution of a single variable
hist(file1$total_amt,main="Histogram of Total amount")

#Scatter plot is used to understand relation between two variable
plot(file1$Rate, file1$total_amt, main = "Scatterplot of Rate and Total amount", xlab = "Rate", ylab="Amount")

There are many ways to use the functions above. R functions mostly complement each other, and I have tried to show this through a few examples in the above notebook.

Thank You and I hope you liked my work