# Factors, part 1: why a character vector is a poor fit for categorical data.
# Imagine that you have a variable that records month.
# Create a character vector `month`:
month <- c("Dec", "Apr", "Jan", "Mar")
class(month)
## [1] "character"
# Using a string to record this variable has two problems:
# (1) Sorting order
# It doesn't sort in a useful way (alphabetical, not calendar order):
sort(month)
## [1] "Apr" "Dec" "Jan" "Mar"
# (2) Typos
# There are only twelve possible months,
# and there's nothing saving you from typos:
monthwrong <- c("Dec", "Apr", "Jam", "Mar")
# Can you identify the mistake? ("Jam" should be "Jan".)
# You can fix both of these problems with a factor.
# To create a factor, start by creating a character vector of the valid
# levels, written in the order you want them to sort:
monthlevels <- c(
  "Jan", "Feb", "Mar", "Apr", "May", "Jun",
  "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
)
# Now you can create a factor.
# Without a levels argument the levels are taken from the data itself,
# and they come out in alphabetical order:
monthf <- factor(month)
print(month)
## [1] "Dec" "Apr" "Jan" "Mar"
print(monthf)
## [1] Dec Apr Jan Mar
## Levels: Apr Dec Jan Mar
# as.factor() gives the same result here:
monthfak <- as.factor(month)
print(monthfak)
## [1] Dec Apr Jan Mar
## Levels: Apr Dec Jan Mar
# Supplying the levels explicitly fixes both their set and their order:
monthfac <- factor(month, levels = monthlevels)
print(monthfac)
## [1] Dec Apr Jan Mar
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
class(monthfac)
## [1] "factor"
# Summary of the factor: a count per level, including unused levels
summary(monthfac)
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 1 0 1 1 0 0 0 0 0 0 0 1
# For comparison, summary() of an integer vector reports quantiles instead:
summary(1:9)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 3 5 5 7 9
# Now let us sort the data
sort(monthfac)
## [1] Jan Mar Apr Dec
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
# Thus a factor sorts by the order of its levels, not alphabetically.
# Now let us see how it handles a mistake
monthwrong <- c("Dec", "Apr", "Jam", "Mar")
monthfacw <- factor(monthwrong, levels = monthlevels)
print(monthfacw)
## [1] Dec Apr <NA> Mar
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
any(is.na(monthfacw))
## [1] TRUE
anyNA(monthfacw)
## [1] TRUE
is.na(monthfacw)
## [1] FALSE FALSE TRUE FALSE
# Any values not in the set of levels are silently converted to NA.
# A factor's values can only be one of the predefined levels or NA.
# Factors, part 2: default versus explicit levels, and how factors are stored.
# (`month` and `monthfac` are defined earlier in this script.)
# Create a gender vector
gender <- c("Male", "Female", "Female", "Male", "Female")
unique(gender)
## [1] "Male" "Female"
# Create a gender factor without the levels argument:
# the levels come out in alphabetical order, not order of appearance.
genderfact <- factor(gender)
print(genderfact)
## [1] Male Female Female Male Female
## Levels: Female Male
# The same happens with the month vector:
factor(month)
## [1] Dec Apr Jan Mar
## Levels: Apr Dec Jan Mar
# Creating a factor with levels defined.
# A level may be declared even if it never occurs in the data
# ("transgender" here).
genderf <- factor(
  c("female", "male", "male", "female"),
  levels = c("female", "transgender", "male")
)
print(genderf)
## [1] female male male female
## Levels: female transgender male
# Check if a variable is a factor
class(genderf)
## [1] "factor"
is.factor(genderf)
## [1] TRUE
str(genderf)
## Factor w/ 3 levels "female","transgender",..: 1 3 3 1
# Advantage of storage: the factor's values are held as integer codes
mode(monthfac)
## [1] "numeric"
storage.mode(monthfac)
## [1] "integer"
# Full signature of factor(), for reference:
# factor(x = character(), levels, labels = levels, exclude = NA,
#        ordered = is.ordered(x), nmax = NA)
# Factors, part 3: properties of factors, illustrated with T-shirt sizes.
# A factor can be built, printed, sorted and indexed like a character vector.
tsize <- c("XL", "L", "XL", "S", "XXL", "L", "XL", "M")
tsizef <- factor(tsize)
tsizefl <- factor(tsize, levels = c("S", "M", "L", "XL", "XXL"))
print(tsize)
## [1] "XL" "L" "XL" "S" "XXL" "L" "XL" "M"
print(tsizef)
## [1] XL L XL S XXL L XL M
## Levels: L M S XL XXL
print(tsizefl)
## [1] XL L XL S XXL L XL M
## Levels: S M L XL XXL
# Sorting: character vectors and default-level factors sort alphabetically,
# while a factor with explicit levels sorts in level order.
sort(tsize)
## [1] "L" "L" "M" "S" "XL" "XL" "XL" "XXL"
sort(tsizef)
## [1] L L M S XL XL XL XXL
## Levels: L M S XL XXL
sort(tsizefl)
## [1] S M L L XL XL XL XXL
## Levels: S M L XL XXL
print(tsizefl[3])
## [1] XL
## Levels: S M L XL XXL
print(tsizefl[5])
## [1] XXL
## Levels: S M L XL XXL
# Comparing elements of an unordered factor is not meaningful:
# R warns and returns NA.
tsizefl[5] > tsizefl[3]
## Warning in Ops.factor(tsizefl[5], tsizefl[3]): '>' not meaningful for factors
## [1] NA

# Property: the character values are stored as integer codes
print(tsizefl)
## [1] XL L XL S XXL L XL M
## Levels: S M L XL XXL
length(tsizefl)
## [1] 8
table(tsizefl)
## tsizefl
## S M L XL XXL
## 1 1 2 3 1
unclass(tsizefl)
## [1] 4 3 4 1 5 3 4 2
## attr(,"levels")
## [1] "S" "M" "L" "XL" "XXL"

# Property: a factor carries its levels as an attribute
attributes(tsizefl)
## $levels
## [1] "S" "M" "L" "XL" "XXL"
##
## $class
## [1] "factor"
levels(tsizefl)
## [1] "S" "M" "L" "XL" "XXL"
class(tsizefl)
## [1] "factor"

# Property: a factor can be ordered
print(tsizefl)
## [1] XL L XL S XXL L XL M
## Levels: S M L XL XXL
is.ordered(tsizefl)
## [1] FALSE
tsizeflo <- factor(
  tsize,
  levels = c("S", "M", "L", "XL", "XXL"),
  ordered = TRUE
)
print(tsizeflo)
## [1] XL L XL S XXL L XL M
## Levels: S < M < L < XL < XXL
is.ordered(tsizeflo)
## [1] TRUE
sort(tsizeflo)
## [1] S M L L XL XL XL XXL
## Levels: S < M < L < XL < XXL
print(tsizeflo[3])
## [1] XL
## Levels: S < M < L < XL < XXL
print(tsizeflo[5])
## [1] XXL
## Levels: S < M < L < XL < XXL
# With an ordered factor the comparison is meaningful:
tsizeflo[5] > tsizeflo[3]
## [1] TRUE

# Converting unordered factors to ordered factors keeps the existing
# level order, whatever it is.
as.ordered(tsizefl)
## [1] XL L XL S XXL L XL M
## Levels: S < M < L < XL < XXL
as.ordered(tsizef)
## [1] XL L XL S XXL L XL M
## Levels: L < M < S < XL < XXL

# Advantage of factors in plotting
# plot(tsize)   # cannot plot a character vector
plot(tsizef)    # can plot, but the sequence is alphabetical

plot(tsizefl)   # can plot with the desired sequence

plot(tsizeflo)  # can plot with the desired sequence
# Factors, part 4: levels versus labels.
# First a factor with explicit levels only: it prints the raw codes.
genderflx <- factor(
  c("F", "M", "M", "F"),
  levels = c("F", "T", "M")
)
print(genderflx)
## [1] F M M F
## Levels: F T M
levels(genderflx)
## [1] "F" "T" "M"
# Same data and levels, plus labels: each level is renamed, position by
# position, to the corresponding label.
genderflab <- factor(
  c("F", "M", "M", "F"),
  levels = c("F", "T", "M"),
  labels = c("Female", "Transgender", "Male")
)
# Side-by-side comparison of the two factors
print(genderflx)
## [1] F M M F
## Levels: F T M
print(genderflab)
## [1] Female Male Male Female
## Levels: Female Transgender Male
# The underlying integer codes are the same; only the level names differ.
unclass(genderflx)
## [1] 1 3 3 1
## attr(,"levels")
## [1] "F" "T" "M"
unclass(genderflab)
## [1] 1 3 3 1
## attr(,"levels")
## [1] "Female" "Transgender" "Male"
levels(genderflx)
## [1] "F" "T" "M"
levels(genderflab)
## [1] "Female" "Transgender" "Male"