library(ggplot2)
library(gridExtra)
library(lubridate)
Let’s clean the global environment before moving further
# Remove every object from the global environment. Attached packages are not
# affected, so the library() calls above stay in effect.
rm(list = ls())
# Close the active graphics device, but only when one is actually open:
# calling dev.off() while only the null device (number 1) exists stops the
# script with "cannot shut down device 1 (the null device)".
if (dev.cur() > 1L) {
  dev.off()
}
## null device
## 1
# Emit a form-feed character; RStudio treats it as "clear the console"
# (other front ends may simply print the character).
cat("\014")
# Numeric vector whose last element is a missing value (NA).
x <- c(12,1,5,18,2,6,NA)
# is.na() tests every element and returns TRUE where the value is missing.
is.na(x)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE TRUE
# Subsetting with the negated mask keeps only the non-missing values.
x[!is.na(x)]
## [1] 12 1 5 18 2 6
# A vector built from plain number literals has class "numeric".
d <- c(1,2,3,4,5,6,11)
class(d)
## [1] "numeric"
We can see that, by default, creating a numeric vector with the c() function produces a vector of double-precision numeric values (real numbers). How can we create a vector of integer values?
You can do this by explicitly appending an “L” to each value:
# The L suffix turns each literal into an integer instead of a double.
i <- c(1L,2L,3L,4L,5L,6L,7L,8L,9L,10L)
# typeof() reports the internal storage type of the vector.
typeof(i)
## [1] "integer"
You can also use as.integer():
# Convert the double vector d (created above) to integer storage.
f <-as.integer(d)
typeof(f)
## [1] "integer"
# Character vector holding three names.
w <-c("Alice","Bob","Charlie")
typeof(w)
## [1] "character"
# length() counts the elements of the vector, not the characters inside them.
length(w)
## [1] 3
# nchar() counts the characters of one string.
nchar(w[1])
## [1] 5
nchar(w[2])
## [1] 3
# Convert the integer vector f to its character representation.
as.character(f)
## [1] "1" "2" "3" "4" "5" "6" "11"
Any value written within a pair of single or double quotes in R is treated as a string. Internally, R stores every string within double quotes, even when you create it with single quotes.
# Three string fragments used by the string helpers below.
# NOTE(review): the variable `c` shares its name with base::c(). Calls such as
# c(1, 2) keep working because R skips non-function objects when it looks up a
# function, but passing `c` as a value (e.g. do.call(c, ...)) would pick up
# this string instead. The name is kept because later code may rely on it.
a <- "Hey!"
b <- "How"
c <- "are you?"
# paste() joins its arguments with a single space by default.
paste(a, b, c)
## [1] "Hey! How are you?"
# `sep` sets the separator placed between the arguments.
paste(a, b, c, sep = "-")
## [1] "Hey!-How-are you?"
# paste0() is the idiomatic spelling of paste(..., sep = ""). `collapse` is
# only needed to merge the elements of a longer vector, which is not the case
# for three length-one strings.
print(paste0(a, b, c))
## [1] "Hey!Howare you?"
# toupper() converts every letter to upper case.
toupper(a)
## [1] "HEY!"
# substring() extracts characters 3 to 8 of "How are you?".
substring(paste(b, c), 3, 8)
## [1] "w are "
You can look up the stringr package for further details: https://github.com/rstudio/cheatsheets/raw/master/strings.pdf
startsWith(x, prefix) tests whether a string starts with the given prefix.
# Sample string for the prefix/suffix tests.
temp <-"Hello World!"
# TRUE because the string begins with exactly "He".
startsWith(temp,"He")
## [1] TRUE
startsWith(temp,"he") # the comparison is case-sensitive, so "he" does not match "He"
## [1] FALSE
endsWith(x, suffix) tests whether a string ends with the given suffix.
# TRUE because the last character of temp is "!".
endsWith(temp,"!")
## [1] TRUE
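grep(pattern, x) searches for pattern in each element of x and returns the indices of the elements that match (an empty integer vector if nothing matches). Here x is a single string, so a match returns 1. Use grepl() if you want TRUE/FALSE instead.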
# Sample sentence stored as a character vector of length one.
temp1<- "We are inconsequential cosmic dust, bumping and milling about on a tiny blue speck."
# grep() returns the index of each element that contains the pattern; temp1
# has a single element, so a match is reported as index 1.
grep("dust",temp1)
## [1] 1
# Still 1: the result is the position of the matching element, not the number
# of times "a" occurs in the sentence.
grep("a",temp1)
## [1] 1
sub(pattern, replacement, x) replaces the first match of pattern in x with replacement; use gsub() to replace all matches.
# sub() changes only the first match; neither call modifies temp1 itself,
# each returns a new string.
sub("a","e",temp1) # replace the first "a" with "e"
## [1] "We ere inconsequential cosmic dust, bumping and milling about on a tiny blue speck."
gsub("a","e",temp1) # replace every "a" with "e"
## [1] "We ere inconsequentiel cosmic dust, bumping end milling ebout on e tiny blue speck."
# Read the wine data set into a data frame.
# NOTE(review): the path points into the current user's Downloads folder, so
# the file has to be placed there (or the path adjusted) before running this.
wine <- read.csv("~/Downloads/WINE.csv")
# head() shows the first six rows.
head(wine)
## PH ALCOHOL QUALITY COUNTRY PRICE RATE YEAR
## 1 3.57 10.2 5 Italy 145.65 93 1982
## 2 3.20 9.8 5 Argentina 17.12 53 2000
## 3 3.42 11.0 6 Kazahstan 63.31 8 2003
## 4 3.52 11.2 6 Argentina 89.87 82 2002
## 5 3.45 10.5 5 Argentina 128.45 93 2007
## 6 3.51 9.4 5 Kazahstan 20.38 66 1973
Conceptually, factors represent categorical variables. They are stored as a vector of integer codes together with a corresponding set of character labels (the levels) that are used when the factor is displayed. as.factor(x) converts x to a factor; is.factor(x) returns TRUE if x is a factor and FALSE otherwise.
# Count how many rows fall into each distinct QUALITY value (3 to 8 here).
table(wine$QUALITY)
##
## 3 4 5 6 7 8
## 48 84 727 685 249 76
Let’s see how this data is stored
# Convert QUALITY to a factor; this overwrites the earlier vector x.
x<-as.factor(wine$QUALITY)
# str() shows the levels followed by the integer code stored for each row
# (value 5 is the third level, so it is stored as code 3).
str(x)
## Factor w/ 6 levels "3","4","5","6",..: 3 3 4 4 3 3 4 4 4 4 ...
# Basic bar plot of PRICE by QUALITY, one colour per quality level.
# Columns are referenced by bare name inside aes(): writing wine$QUALITY there
# bypasses the `data` argument, which the ggplot2 documentation advises
# against. The fill is mapped inside aes() so that ggplot2 chooses the colours
# and draws a legend, instead of receiving a factor where a colour
# specification is expected.
p <- ggplot(
  data = wine,
  aes(x = as.factor(QUALITY), y = PRICE, fill = as.factor(QUALITY))
) +
  theme_minimal() +
  # stat = "identity" uses the PRICE values as bar heights; rows sharing a
  # quality level are stacked into one bar.
  geom_bar(stat = "identity")
p
# Turn QUALITY into a factor, stored back into the data frame.
wine$QUALITY <- factor(wine$QUALITY)
# Replace the level names with descriptive labels. Labels are assigned to the
# levels in order, so this relies on the column having exactly the six levels
# shown by table() above (3, 4, 5, 6, 7, 8).
wine$QUALITY <- factor(wine$QUALITY,labels=c("very bad","bad","fair","good","very good","perfect"))
head(wine$QUALITY)
## [1] fair fair good good fair fair
## Levels: very bad bad fair good very good perfect
# summary() of a factor gives the number of rows per level.
summary(wine$QUALITY)
## very bad bad fair good very good perfect
## 48 84 727 685 249 76
# `exclude` removes the "perfect" level; rows that had it become NA, as the
# summary below shows.
wine$QUALITY <- factor(wine$QUALITY,exclude="perfect")
levels(wine$QUALITY)
## [1] "very bad" "bad" "fair" "good" "very good"
summary(wine$QUALITY)
## very bad bad fair good very good NA's
## 48 84 727 685 249 76
The values that originally had quality = 8 (“perfect”) have now become NA. To drop the NA values, we can use the na.omit() function.
A simple function to plot the histogram
# Build a histogram of x with a density curve drawn on top.
#
# Both layers are rescaled to a maximum of 1 (`ndensity` for the histogram,
# `scaled` for the density) so they share the same y axis.
#
# x: vector of values to plot.
# Returns the ggplot object. The plot is the last evaluated expression, so it
# is returned visibly and is drawn automatically when the function is called
# at the console (the previous version ended in an assignment, which returned
# the plot invisibly and drew nothing).
plot_histogram <- function(x) {
  # after_stat() replaces the deprecated stat() for referring to variables
  # computed by the layer's statistic.
  ggplot(data = data.frame(data = x), aes(x = data)) +
    geom_histogram(aes(y = after_stat(ndensity)), fill = "cyan", bins = 40) +
    geom_density(aes(y = after_stat(scaled)), fill = "thistle", alpha = 0.4)
}
A simple function, explore, that returns the mean, median, standard deviation and length of a vector
# Summarise a vector with four descriptive statistics.
#
# x: vector to summarise.
# Returns a named vector with the elements "Mean", "Median",
# "Standard Deviation" and "Length". Missing values are ignored by the first
# three statistics, while "Length" is the full length of x, NAs included.
explore <- function(x) {
  # TRUE is spelled out: T is an ordinary variable that can be reassigned,
  # which would silently switch NA removal off.
  c(
    "Mean" = mean(x, na.rm = TRUE),
    "Median" = median(x, na.rm = TRUE),
    "Standard Deviation" = sd(x, na.rm = TRUE),
    "Length" = length(x)
  )
}
Let’s create a function that converts dollars to euros
# Convert an amount in US dollars to euros.
#
# x:    numeric vector of dollar amounts (the division is vectorised).
# rate: exchange rate expressed as dollars per euro. It defaults to 1.16, the
#       value that used to be hard-coded, so existing calls give the same
#       result.
# Returns x divided by rate.
dollarsToEuro <- function(x, rate = 1.16) {
  x / rate
}
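apply() works on matrices and arrays. A data frame passed to it is first converted to a matrix, so a data frame with mixed column types becomes a character matrix (which is why the results below are strings).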
# apply(wine, 2, f) calls f once per column (margin 2).
# NOTE(review): apply() converts the data frame to a matrix first. Because
# wine mixes numeric, factor and character columns, every value becomes a
# string, as the quoted results below show, so min/max compare text rather
# than numbers. Restricting the call to the numeric columns would avoid this.
apply(wine, 2, min) # minimum of each column (compared as strings)
## PH ALCOHOL QUALITY COUNTRY PRICE RATE
## "2.74" " 8.40" NA "Argentina" " 0.50" " 0"
## YEAR
## "1960"
apply(wine, 2, max) # maximum of each column (compared as strings)
## PH ALCOHOL QUALITY COUNTRY PRICE RATE
## "4.01" "14.90" NA "Montenegro" "909.00" "100"
## YEAR
## "2014"
# Summary statistics (not a histogram) for each column via explore().
# The "Mean" row is NA throughout because the columns reach explore() as
# character vectors.
apply(wine, 2, explore)
## PH ALCOHOL QUALITY COUNTRY
## Mean NA NA NA NA
## Median "3.32" "10.40" "good" "Italy"
## Standard Deviation "0.219453892309641" "1.61372304659001" NA NA
## Length "1869" "1869" "1869" "1869"
## PRICE RATE YEAR
## Mean NA NA NA
## Median "102.39" " 51" "1988"
## Standard Deviation "92.6447493724239" "28.8186784435111" "15.8779019707066"
## Length "1869" "1869" "1869"
# Convert the PRICE of the first 10 rows to euros, one value at a time.
# lapply() always returns a list; x is overwritten with that list.
x<-lapply(wine[1:10,]$PRICE,dollarsToEuro) # only the first 10 rows
print(class(x))
## [1] "list"
# Each converted price is its own list element.
print(unclass(x))
## [[1]]
## [1] 125.5603
##
## [[2]]
## [1] 14.75862
##
## [[3]]
## [1] 54.57759
##
## [[4]]
## [1] 77.47414
##
## [[5]]
## [1] 110.7328
##
## [[6]]
## [1] 17.56897
##
## [[7]]
## [1] 82.68103
##
## [[8]]
## [1] 105.4569
##
## [[9]]
## [1] 59.96552
##
## [[10]]
## [1] 48.2069
As we can see, lapply returns a list with one element for each input value.
# Same conversion with sapply(), which simplifies the per-element results
# into a plain numeric vector (see class(x) below).
x<-sapply(wine[1:10,]$PRICE,dollarsToEuro) # only the first 10 rows
class(x)
## [1] "numeric"
unclass(x)
## [1] 125.56034 14.75862 54.57759 77.47414 110.73276 17.56897 82.68103
## [8] 105.45690 59.96552 48.20690
sapply simplifies the results into a single vector where possible (here a numeric vector) instead of returning a list. Both sapply and lapply can be applied to vectors as well as lists.
# tapply() splits PRICE by COUNTRY and applies mean() to each group,
# giving the average price per country.
x<-tapply(wine$PRICE, wine$COUNTRY, mean)
x
## Argentina Australia California France Italy Kazahstan Montenegro
## 120.60300 145.12171 131.90437 138.02150 136.18953 72.35150 73.56176
# The result is a one-dimensional array named by group.
class(x)
## [1] "array"