# Vectors are created with c(), which concatenates its arguments
x <- c(1, 2, 3, 4)
y <- c(5, 6, 7, 8)
length(x)
## [1] 4
length(y)
## [1] 4
# Arithmetic on vectors is carried out coordinate by coordinate
x + y
## [1] 6 8 10 12
x^2
## [1] 1 4 9 16
x^2 + y^2
## [1] 26 40 58 80
# ls() lists all objects currently defined in the workspace
ls()
## [1] "x" "y"
# rm() removes the indicated objects; afterwards the workspace is empty
rm(x, y)
ls()
## character(0)
# Creating sequences: start at 2, step by 2, five values in total.
# The argument name length.out is written in full instead of relying on
# partial matching of the abbreviation "length".
x <- seq(from = 2, by = 2, length.out = 5)
x
## [1] 2 4 6 8 10
# We can ask R for the help page of a command
?seq
## starting httpd help server ... done
# This is how we access a specific component (indices start at 1)
x[2]
## [1] 4
# The colon operator is another way to write a sequence
2:4
## [1] 2 3 4
# We can request a sequence of components from a vector:
# x[2], x[3], and x[4]
x[2:4]
## [1] 4 6 8
# A negative index drops a component: this is x without its second component
x[-2]
## [1] 2 6 8 10
# x without components 2, 3, and 4 (-2:-4 is the sequence -2, -3, -4)
x[-2:-4]
## [1] 2 10
# c stands for concatenate, this also creates a vector
c(2:4)
## [1] 2 3 4
# Here are different, equivalent ways to remove components 2, 3, and 4 of x
x[c(-2:-4)]
## [1] 2 10
x[-c(2:4)]
## [1] 2 10
x[-c(2, 3, 4)]
## [1] 2 10
# This is how we can create matrices in R
# they are filled column by column by default
x <- matrix(data = c(1, 2, 3, 4), nrow = 2, ncol = 2)
x
## [,1] [,2]
## [1,] 1 3
## [2,] 2 4
# or row by row if requested
# here we just list values of parameters, a better programming practice is
# to always name all parameters as above
# TRUE is written in full: T is an ordinary variable that can be reassigned,
# whereas TRUE is a reserved word
y <- matrix(c(5, 6, 7, 8), 2, 2, byrow = TRUE)
y
## [,1] [,2]
## [1,] 5 6
## [2,] 7 8
# x is organized by columns (column vectors), y by rows
# Algebraic operations are performed entrywise
z <- sqrt(x)
z
## [,1] [,2]
## [1,] 1.000000 1.732051
## [2,] 1.414214 2.000000
# Squaring z entrywise recovers x
z^2
## [,1] [,2]
## [1,] 1 3
## [2,] 2 4
################################
# Moving toward prob and stats #
################################
# rnorm(5) generates five pseudo-random numbers
# from the standard normal distribution
x <- rnorm(5)
x
## [1] -0.01623584 1.24861905 0.24260954 0.91034165 0.34329212
y <- rnorm(5)
y
## [1] 1.0495602 1.3130091 0.1924788 0.9249057 0.8703098
# plot() is the basic plotting function in R
plot(x, y)

# Correlation between x and y
cor(x, y)
## [1] 0.5056599
# Without a seed the draws above differ from run to run. To make an
# analysis with pseudo-random numbers reproducible, we need to set
# the seed for the generator
set.seed(2023)
x <- rnorm(5)
x
## [1] -0.08378436 -0.98294375 -1.87506732 -0.18614466 -0.63348570
set.seed(2023)
y <- rnorm(5)
y
## [1] -0.08378436 -0.98294375 -1.87506732 -0.18614466 -0.63348570
# Same seed, same values, so x and y are perfectly correlated
plot(x, y)

cor(x, y)
## [1] 1
# No seed is set before these two draws
x <- rnorm(50)
y <- rnorm(50)
plot(
  x, y,
  xlab = "Measurements errors machine 1",
  ylab = "Measurements errors machine 2",
  main = "Measurement errors",
  col = "red"
)

# A little more about matrices: a 4 x 4 matrix filled column by column
A <- matrix(1:16, nrow = 4, ncol = 4)
A
## [,1] [,2] [,3] [,4]
## [1,] 1 5 9 13
## [2,] 2 6 10 14
## [3,] 3 7 11 15
## [4,] 4 8 12 16
# Single entry: row 2, column 3
A[2, 3]
## [1] 10
# Rows 1 and 3, columns 2 and 4
A[c(1, 3), c(2, 4)]
## [,1] [,2]
## [1,] 5 13
## [2,] 7 15
# Rows 1 to 3, columns 2 to 4
A[1:3, 2:4]
## [,1] [,2] [,3]
## [1,] 5 9 13
## [2,] 6 10 14
## [3,] 7 11 15
A[1, ] # First row
## [1] 1 5 9 13
A[, 1] # First column
## [1] 1 2 3 4
# Note that A[1, ] and A[, 1] are not matrices but vectors
# Sometimes we want to keep the matrix data type
A[, 1, drop = FALSE]
## [,1]
## [1,] 1
## [2,] 2
## [3,] 3
## [4,] 4
# First two rows, all columns
A[1:2, ]
## [,1] [,2] [,3] [,4]
## [1,] 1 5 9 13
## [2,] 2 6 10 14
# All rows, first two columns
A[, 1:2]
## [,1] [,2]
## [1,] 1 5
## [2,] 2 6
## [3,] 3 7
## [4,] 4 8
A[-c(1, 3), ] # The negative sign eliminates the indicated rows
## [,1] [,2] [,3] [,4]
## [1,] 2 6 10 14
## [2,] 4 8 12 16
# Dimension - number of rows and columns for a two-dimensional matrix
dim(A)
## [1] 4 4
###################
# Analyzing Autos #
###################
library(readxl)
# Folder that holds the course files. It is defined once so that the working
# directory and the spreadsheet location below always agree.
# NOTE(review): this absolute path is specific to one machine - adjust it
# before running the script elsewhere.
data_dir <- "C:/Users/lgawarec/Documents/MATH-430/R files"
autos_path <- file.path(data_dir, "Autosxlsx.xlsx")
setwd(data_dir)
getwd()
## [1] "C:/Users/lgawarec/Documents/MATH-430/R files"
# The following commands are comments only because when
# compiling a report, there cannot be two different files
# Undo comments for presentation, then redo comments for compiling
# Autos <- read_excel(autos_path)
# summary(Autos)
# View(Autos)
# dim(Autos)
# Something is wrong: why is horsepower a character variable?
# Reading "?" as NA - Not Available (missing data)
Autos <- read_excel(autos_path, na = "?")
# View() opens a data viewer, so it is only called in an interactive session
if (interactive()) {
  View(Autos)
}
summary(Autos)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.50 1st Qu.:4.000 1st Qu.:104.0 1st Qu.: 75.0 1st Qu.:2223
## Median :23.00 Median :4.000 Median :146.0 Median : 93.5 Median :2800
## Mean :23.52 Mean :5.458 Mean :193.5 Mean :104.5 Mean :2970
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:262.0 3rd Qu.:126.0 3rd Qu.:3609
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
## NA's :5
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 Length:397
## 1st Qu.:13.80 1st Qu.:73.00 1st Qu.:1.000 Class :character
## Median :15.50 Median :76.00 Median :1.000 Mode :character
## Mean :15.56 Mean :75.99 Mean :1.574
## 3rd Qu.:17.10 3rd Qu.:79.00 3rd Qu.:2.000
## Max. :24.80 Max. :82.00 Max. :3.000
##
dim(Autos)
## [1] 397 9
# This command will produce an "individual values plot" of mpg by cylinders
plot(Autos$cylinders, Autos$mpg,
  xlab = "Cylinders", ylab = "mpg",
  main = "How cylinders affect mpg"
)

# To plot side-by-side box-plots, we convert "cylinders" from a numerical
# variable to a factor variable
cylinderQ <- as.factor(Autos$cylinders)
head(cylinderQ)
## [1] 8 8 8 8 8 8
## Levels: 3 4 5 6 8
# TRUE is written in full: T is an ordinary variable that can be reassigned
plot(cylinderQ, Autos$mpg,
  varwidth = TRUE, xlab = "Number of cylinders",
  ylab = "mpg", main = "How cylinders affect mpg"
)

# Here is a way to interact with the graph
# win.graph()
# plot(Autos$horsepower,Autos$mpg)
# identify(Autos$horsepower,Autos$mpg, Autos$name, tolerance=0.5)
# IMPORTANT: from now on, we will use the following packages:
# "MASS" and "ISLR2"
# We will install them now
# install.packages("MASS")
# install.packages("ISLR2")
# From now on, start every R script with two commands:
library(MASS)
library(ISLR2)
##
## Attaching package: 'ISLR2'
##
## The following object is masked from 'package:MASS':
##
## Boston