# Data Analysis using R
#
# Vectors and Data Frames in R
#
# ------------------------------------------------------------

############################################################
# Clean R Environment
############################################################
# Remove all objects from the workspace so the examples start from a known
# state. Note: this does not detach packages or reset options(); restarting
# the R session is the only fully clean start.
rm(list = ls())

############################################################
### Set the working directory where your files are located
# The path is machine-specific, so only switch to it when it exists; calling
# setwd() on a missing folder would stop the script with an error.
data_dir <- "D:/D Drive/Ph.D. Course Work/PhD_2025/DataFile"
if (dir.exists(data_dir)) {
  setwd(data_dir)
} else {
  warning("Working directory not changed; folder not found: ", data_dir,
          call. = FALSE)
}
### Verify the current working directory
getwd()

## [1] "D:/D Drive/Ph.D. Course Work/PhD_2025/DataFile"

# Student identifiers (character vector)
st <- c("s1", "s2", "s3", "s4")
# Marks, one per student (numeric vector)
marks <- c(20.5, 25, 27, 21)
# Typing the name alone prints the vector
marks

## [1] 20.5 25.0 27.0 21.0

### R usually displays 7 significant digits for numeric values.
### Sequence of integers with an increment of 1
# The colon operator builds the integer sequence from 1 through 10.
1:10

##  [1]  1  2  3  4  5  6  7  8  9 10

### Sequence with arbitrary increment
# seq() steps from 1 to 20 in increments of 0.2 (96 values, both ends included).
seq(from = 1, to = 20, by = 0.2)

##  [1]  1.0  1.2  1.4  1.6  1.8  2.0  2.2  2.4  2.6  2.8  3.0  3.2  3.4  3.6  3.8
## [16]  4.0  4.2  4.4  4.6  4.8  5.0  5.2  5.4  5.6  5.8  6.0  6.2  6.4  6.6  6.8
## [31]  7.0  7.2  7.4  7.6  7.8  8.0  8.2  8.4  8.6  8.8  9.0  9.2  9.4  9.6  9.8
## [46] 10.0 10.2 10.4 10.6 10.8 11.0 11.2 11.4 11.6 11.8 12.0 12.2 12.4 12.6 12.8
## [61] 13.0 13.2 13.4 13.6 13.8 14.0 14.2 14.4 14.6 14.8 15.0 15.2 15.4 15.6 15.8
## [76] 16.0 16.2 16.4 16.6 16.8 17.0 17.2 17.4 17.6 17.8 18.0 18.2 18.4 18.6 18.8
## [91] 19.0 19.2 19.4 19.6 19.8 20.0

### 1:10 creates integer sequences with step 1; seq() allows custom increments.
rep(c("A", "B"), 5) # Repeats the whole vector c("A", "B") five times.

##  [1] "A" "B" "A" "B" "A" "B" "A" "B" "A" "B"

st[3]   # Access the 3rd element (indexing starts at 1)

## [1] "s3"

st[-3]  # Exclude the 3rd element; a negative index drops that position

## [1] "s1" "s2" "s4"

###Types of Vectors
####### Numeric
####### Integer
####### Complex
####### Character
####### Logical


### Factor
### Factors are used for categorical data in R.
v <- c(1, 1, 0, 0, 1, 1)
# Map the raw values 0/1 onto the labels "good"/"bad". ordered = TRUE makes
# this an ordered factor whose ranking follows `levels`, i.e. good < bad.
f1 <- factor(
  v,
  levels = c(0, 1),
  labels = c("good", "bad"),
  ordered = TRUE
)
f1

## [1] bad  bad  good good bad  bad 
## Levels: good < bad

levels(f1)  # All level labels, in level order

## [1] "good" "bad"

nlevels(f1) # Number of levels

## [1] 2

### Factors store categorical data and can be ordered. Useful for statistical modeling.


# as.numeric() on a factor returns the level codes (good = 1, bad = 2),
# not the original 0/1 values stored in v.
as.numeric(f1)

## [1] 2 2 1 1 2 2

as.character(marks)  # Convert numeric to character; whole numbers lose the ".0"

## [1] "20.5" "25"   "27"   "21"

length(st)        # Number of elements

## [1] 4

st[length(st)]    # Last element: index with the vector's own length

## [1] "s4"

### Numerical Operations on Vectors

# Arithmetic with a single number is applied to every element of marks.
marks + 5   # Addition

## [1] 25.5 30.0 32.0 26.0

marks - 2   # Subtraction

## [1] 18.5 23.0 25.0 19.0

marks * 2   # Multiplication

## [1] 41 50 54 42

marks / 2   # Division

## [1] 10.25 12.50 13.50 10.50

marks ^ 2   # Exponentiation (each element squared)

## [1] 420.25 625.00 729.00 441.00

### Vector Summary Functions
# Each call below reduces marks to a single value, except cumsum(), which
# returns a running total with one entry per element.
min(marks)     # Smallest value

## [1] 20.5

max(marks)     # Largest value

## [1] 27

sum(marks)     # Total of all elements

## [1] 93.5

prod(marks)    # Product of all elements

## [1] 290587.5

cumsum(marks)  # Cumulative sum

## [1] 20.5 45.5 72.5 93.5

### Logical Operators
# All three operators below work element by element on logical vectors.
x <- c(TRUE, FALSE, TRUE)
!x                        # NOT: flips every element

## [1] FALSE  TRUE FALSE

x & c(TRUE, TRUE, FALSE)  # AND: TRUE only where both sides are TRUE

## [1]  TRUE FALSE FALSE

x | c(FALSE, TRUE, TRUE)  # OR: TRUE where at least one side is TRUE

## [1] TRUE TRUE TRUE

marks > 22             # Comparison: TRUE where the mark exceeds 22

## [1] FALSE  TRUE  TRUE FALSE

# NOTE: == is an exact test; for doubles produced by arithmetic,
# isTRUE(all.equal(a, b)) is the safer comparison.
marks == 25            # Equality, checked element by element

## [1] FALSE  TRUE FALSE FALSE

### paste() combines strings with spaces or separators.
# st and marks are combined element by element, joined by single spaces.
paste("Student", st, "has marks", marks)

## [1] "Student s1 has marks 20.5" "Student s2 has marks 25"  
## [3] "Student s3 has marks 27"   "Student s4 has marks 21"

sort(marks)   # Sort ascending

## [1] 20.5 21.0 25.0 27.0

rank(marks)   # Rank of each element in place (1 = smallest)

## [1] 1 3 4 2

order(marks)  # Indices that would sort the vector

## [1] 1 4 2 3

### Data Frame

# Combine the student vectors with a grade column, one row per student.
df <- data.frame(
  Name = st,
  Marks = marks,
  Grade = c("A", "B", "A", "C")
)
df

##   Name Marks Grade
## 1   s1  20.5     A
## 2   s2  25.0     B
## 3   s3  27.0     A
## 4   s4  21.0     C

### Data frames store different types in columns but all columns must have the same length.

# A second, purely numeric data frame with three rows.
cars_df <- data.frame(
  speed = c(4, 7, 12),
  dist = c(2, 10, 24)
)
cars_df

##   speed dist
## 1     4    2
## 2     7   10
## 3    12   24

### Data Frame Dimensions and Column Names

nrow(df)     # Number of rows

## [1] 4

ncol(df)     # Number of columns

## [1] 3

dim(df)      # Rows, then columns, as one vector

## [1] 4 3

names(df)    # Column names

## [1] "Name"  "Marks" "Grade"

dimnames(df) # List holding the row names, then the column names

## [[1]]
## [1] "1" "2" "3" "4"
## 
## [[2]]
## [1] "Name"  "Marks" "Grade"

### Subset of Data Frame
# Indexing is df[rows, columns]; leaving one side empty selects everything.
df[3, ]           # All variables for 3rd observation

##   Name Marks Grade
## 3   s3    27     A

df[, 2]           # 2nd variable for all observations (returned as a vector)

## [1] 20.5 25.0 27.0 21.0

# Rows 1 to 5, 12 and 15 are requested, but df has only 4 rows: each row
# index beyond the last row comes back as a row filled with NA.
df[c(1:5,12,15), ]  # Observations 1 to 5, 12, 15

##      Name Marks Grade
## 1      s1  20.5     A
## 2      s2  25.0     B
## 3      s3  27.0     A
## 4      s4  21.0     C
## NA   <NA>    NA  <NA>
## NA.1 <NA>    NA  <NA>
## NA.2 <NA>    NA  <NA>

# Negative indices drop rows. df has no rows 10 to 20, so nothing is removed
# and all 4 rows are returned.
df[-(10:20), ]       # Remove observations 10 to 20

##   Name Marks Grade
## 1   s1  20.5     A
## 2   s2  25.0     B
## 3   s3  27.0     A
## 4   s4  21.0     C

df[nrow(df), ]        # Last observation

##   Name Marks Grade
## 4   s4    21     C