Load Packages

library(ggplot2)
library(gridExtra)
library(lubridate)

Let’s clean the global environment before moving further

# Start from a clean session: remove all objects from the workspace, close the
# active graphics device (if any) and clear the console.
# Note: rm(list = ls()) removes objects only; attached packages and options
# are left untouched.
rm(list = ls())
# dev.off() raises an error when only the null device (device 1) is active,
# so close a device only when one is actually open.
if (dev.cur() > 1L) dev.off()

## null device 
##           1

# "\014" is the form-feed character; RStudio interprets it as "clear console".
cat("\014")

Data Types in R

Logical Data Type

# Numeric vector whose last element is missing.
x <- c(12, 1, 5, 18, 2, 6, NA)

# Logical mask: TRUE wherever an element is NA.
is.na(x)

## [1] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE

# Keep only the positions that hold an observed (non-missing) value.
x[which(!is.na(x))]

## [1] 12  1  5 18  2  6

Integer and Double/Numeric Data Types

# Plain numeric literals are stored as doubles, so class() reports "numeric".
d <- c(1, 2, 3, 4, 5, 6, 11)
class(d)

## [1] "numeric"

We can see that, by default, creating a numeric vector with the c() function produces a vector of double-precision numeric values (real numbers). How can we create a vector of integer values?

You can do this by explicitly writing an “L” after each value:

# The L suffix marks each literal as an integer rather than a double.
i <- c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L)
typeof(i)

## [1] "integer"

You can also use as.integer()

# Coerce the double vector d to integer storage and confirm the new type.
f <- as.integer(x = d)
typeof(f)

## [1] "integer"

Character Data Types

# Character vector holding three names.
w <- c("Alice", "Bob", "Charlie")
typeof(w)

## [1] "character"

# length() counts the elements of the vector, not the characters in them.
length(w)

## [1] 3

# nchar() counts the characters of an individual element.
nchar(w[[1]])

## [1] 5

nchar(w[[2]])

## [1] 3

# Convert the integer vector f to its character representation.
as.character(f)

## [1] "1"  "2"  "3"  "4"  "5"  "6"  "11"

String manipulations in R

Any value written within a pair of single or double quotes in R is treated as a string. Internally, R stores every string within double quotes, even when you create it with single quotes.

# Three string fragments used by the examples below. The third one is named
# c_str (not c) so that it does not mask the base function c().
a <- "Hey!"
b <- "How"
c_str <- "are you?"

# paste() joins its arguments with a single space by default.
paste(a, b, c_str)

## [1] "Hey! How are you?"

# sep controls the separator placed between the arguments.
paste(a, b, c_str, sep = "-")

## [1] "Hey!-How-are you?"

# paste0() is the idiomatic shorthand for paste(..., sep = "").
print(paste0(a, b, c_str))

## [1] "Hey!Howare you?"

toupper(a)

## [1] "HEY!"

# Characters 3 through 8 of "How are you?".
substring(paste(b, c_str), 3, 8)

## [1] "w are "

You can look up the stringr package for further details https://github.com/rstudio/cheatsheets/raw/master/strings.pdf

String comparison in R

startsWith(string, prefix) tests whether the string begins with the given prefix

temp <- "Hello World!"
startsWith(temp, prefix = "He")

## [1] TRUE

# The comparison is case-sensitive, so a lowercase prefix does not match.
startsWith(temp, prefix = "he")

## [1] FALSE

endsWith(string, suffix) tests whether the string ends with the given suffix

# Check whether temp finishes with an exclamation mark.
endsWith(temp, suffix = "!")

## [1] TRUE

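grep(pattern, x) searches for pattern in each element of x and returns the indices of the elements that match (integer(0) if nothing matches). Below, x is a single string, so a match returns 1. Use grepl(pattern, x) if you want a TRUE/FALSE result instead.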

temp1 <- "We are inconsequential cosmic dust, bumping and milling about on a tiny blue speck."

# Index of the element(s) of temp1 that contain the word "dust".
grep(pattern = "dust", x = temp1)

## [1] 1

# Same search for the single letter "a".
grep(pattern = "a", x = temp1)

## [1] 1

sub(pattern, replacement, x) replaces the first match of pattern in x with replacement; gsub() replaces all matches.

sub(pattern = "a", replacement = "e", x = temp1) # Only the first "a" becomes "e"

## [1] "We ere inconsequential cosmic dust, bumping and milling about on a tiny blue speck."

gsub(pattern = "a", replacement = "e", x = temp1) # Every "a" becomes "e"

## [1] "We ere inconsequentiel cosmic dust, bumping end milling ebout on e tiny blue speck."

Loading the data

# Read the wine data set from the Downloads folder and preview the first rows.
wine <- read.csv(file = "~/Downloads/WINE.csv")
head(x = wine)

##     PH ALCOHOL QUALITY   COUNTRY  PRICE RATE YEAR
## 1 3.57    10.2       5     Italy 145.65   93 1982
## 2 3.20     9.8       5 Argentina  17.12   53 2000
## 3 3.42    11.0       6 Kazahstan  63.31    8 2003
## 4 3.52    11.2       6 Argentina  89.87   82 2002
## 5 3.45    10.5       5 Argentina 128.45   93 2007
## 6 3.51     9.4       5 Kazahstan  20.38   66 1973

Factors & Manipulation

Conceptually, factors are categorical variables. They are stored as a vector of integer values together with a corresponding set of character values (the levels) that are used when the factor is displayed. as.factor(x) converts x to a factor; is.factor(x) returns TRUE if x is a factor and FALSE otherwise.

# Frequency count of each distinct quality score.
table(wine[["QUALITY"]])

## 
##   3   4   5   6   7   8 
##  48  84 727 685 249  76

Let’s see how this data is stored

# Convert the quality score to a factor and inspect how it is stored.
x <- as.factor(wine$QUALITY)
str(x)

##  Factor w/ 6 levels "3","4","5","6",..: 3 3 4 4 3 3 4 4 4 4 ...

# Basic barplot
# Columns are referenced by bare name inside aes() (not as wine$...), so they
# are looked up in the `data` argument. fill is mapped inside aes() so the
# colour follows the quality level through a scale and gets a legend, instead
# of being passed to the geom as a fixed per-row vector.
# geom_col() is the shorthand for geom_bar(stat = "identity").
p <- ggplot(
  data = wine,
  aes(x = as.factor(QUALITY), y = PRICE, fill = as.factor(QUALITY))
) +
  theme_minimal() +
  geom_col() +
  labs(x = "QUALITY", fill = "QUALITY")
p

Change the names of the categories

# Convert QUALITY to a factor and name its levels in one factor() call.
# The labels are assigned, in order, to the sorted distinct values of the column.
wine[["QUALITY"]] <- factor(
  wine[["QUALITY"]],
  labels = c("very bad", "bad", "fair", "good", "very good", "perfect")
)
head(wine[["QUALITY"]])

## [1] fair fair good good fair fair
## Levels: very bad bad fair good very good perfect

# Number of observations in each category.
summary(wine[["QUALITY"]])

##  very bad       bad      fair      good very good   perfect 
##        48        84       727       685       249        76

Deleting a category

# Rebuild the factor without the "perfect" level; observations that were in
# that level become NA.
wine[["QUALITY"]] <- factor(wine[["QUALITY"]], exclude = "perfect")
levels(wine[["QUALITY"]])

## [1] "very bad"  "bad"       "fair"      "good"      "very good"

summary(wine[["QUALITY"]])

##  very bad       bad      fair      good very good      NA's 
##        48        84       727       685       249        76

The values that originally belonged to the “perfect” category (quality = 8) have now become NA. To remove the NA values, we can use the na.omit() function.

Functions in R

A simple function to plot the histogram

# Build a histogram of x with a density curve drawn on top.
#
# Both layers are rescaled to a maximum of 1 (ndensity for the histogram,
# scaled for the density) so they share the same y axis.
#
# x: vector of values to plot.
# Returns the ggplot object visibly, so calling plot_histogram(x) at the top
# level draws the plot; assigning the result still works as before.
plot_histogram <- function(x) {
  ggplot(data = data.frame(data = x), aes(x = data)) +
    # after_stat() replaces the deprecated stat() for computed variables.
    geom_histogram(aes(y = after_stat(ndensity)), fill = "cyan", bins = 40) +
    geom_density(aes(y = after_stat(scaled)), fill = "thistle", alpha = 0.4)
}

A simple function, explore, that returns the mean, median, standard deviation, and length of a vector

# Summarise a vector with four descriptive statistics.
#
# x: vector to summarise.
# Returns a named vector with elements "Mean", "Median",
# "Standard Deviation" and "Length". Missing values are ignored by the first
# three statistics but are still counted in "Length".
explore <- function(x) {
  # TRUE is spelled out because T is an ordinary variable that can be
  # reassigned, which would silently disable NA removal.
  c(
    "Mean" = mean(x, na.rm = TRUE),
    "Median" = median(x, na.rm = TRUE),
    "Standard Deviation" = sd(x, na.rm = TRUE),
    "Length" = length(x)
  )
}

Let’s create a function that converts dollars to euros

# Convert an amount in US dollars to euros.
#
# x:    numeric vector of dollar amounts (the division is vectorised).
# rate: number of dollars per euro; defaults to 1.16, the rate previously
#       hard-coded here, so existing one-argument calls are unchanged.
# Returns x divided by rate.
dollarsToEuro <- function(x, rate = 1.16) {
  x / rate
}

apply(x,index,function): Applying a function to the rows (index=1) or columns (index=2) of a matrix

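apply() is designed for matrices and arrays. A data frame is first converted to a matrix, so if any column is non-numeric, every column is treated as character (which is why the results below are quoted strings).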

# apply() converts the data frame to a matrix before iterating. wine has a
# text column (COUNTRY), so every column is processed as character.
apply(wine, 2, min) # Minimum of each column (second argument 2 = columns)

##          PH     ALCOHOL     QUALITY     COUNTRY       PRICE        RATE 
##      "2.74"     " 8.40"          NA "Argentina"    "  0.50"       "  0" 
##        YEAR 
##      "1960"

apply(wine, 2, max) # Maximum of each column (second argument 2 = columns)

##           PH      ALCOHOL      QUALITY      COUNTRY        PRICE         RATE 
##       "4.01"      "14.90"           NA "Montenegro"     "909.00"        "100" 
##         YEAR 
##       "2014"

# explore() returns the mean, median, standard deviation and length of its
# input; it does not draw a histogram.
apply(wine, 2, explore) # Summary statistics for each column

##                    PH                  ALCOHOL            QUALITY COUNTRY
## Mean               NA                  NA                 NA      NA     
## Median             "3.32"              "10.40"            "good"  "Italy"
## Standard Deviation "0.219453892309641" "1.61372304659001" NA      NA     
## Length             "1869"              "1869"             "1869"  "1869" 
##                    PRICE              RATE               YEAR              
## Mean               NA                 NA                 NA                
## Median             "102.39"           " 51"              "1988"            
## Standard Deviation "92.6447493724239" "28.8186784435111" "15.8779019707066"
## Length             "1869"             "1869"             "1869"

lapply(x,function): Apply a function to each element of the list x

# Convert the first ten prices one element at a time; lapply() returns a list.
x <- lapply(wine$PRICE[1:10], dollarsToEuro)
print(class(x))

## [1] "list"

print(unclass(x))

## [[1]]
## [1] 125.5603
## 
## [[2]]
## [1] 14.75862
## 
## [[3]]
## [1] 54.57759
## 
## [[4]]
## [1] 77.47414
## 
## [[5]]
## [1] 110.7328
## 
## [[6]]
## [1] 17.56897
## 
## [[7]]
## [1] 82.68103
## 
## [[8]]
## [1] 105.4569
## 
## [[9]]
## [1] 59.96552
## 
## [[10]]
## [1] 48.2069

As we can see, lapply returns a list that contains one element for each result.

sapply(x,function): Apply a function to each element of the list x with simplification of result

# Same conversion of the first ten prices, this time with sapply(), which
# simplifies the per-element results.
x <- sapply(wine$PRICE[1:10], dollarsToEuro)
class(x)

## [1] "numeric"

unclass(x)

##  [1] 125.56034  14.75862  54.57759  77.47414 110.73276  17.56897  82.68103
##  [8] 105.45690  59.96552  48.20690

sapply simplifies the results into a single vector when possible (here, a numeric vector) rather than a list. Both sapply and lapply operate on lists and vectors.

tapply(x, y, function): Applies a function to subsets of a vector x, where the subsets are defined by a grouping vector y.

# Mean price within each country: PRICE is split by COUNTRY and mean() is
# applied to every group.
x <- tapply(X = wine$PRICE, INDEX = wine$COUNTRY, FUN = mean)
x

##  Argentina  Australia California     France      Italy  Kazahstan Montenegro 
##  120.60300  145.12171  131.90437  138.02150  136.18953   72.35150   73.56176

class(x)

## [1] "array"

Recitation 06 - R Review and Data Formatting

Janish Parikh

10/11/2021