# Numeric vectors are built with c(). `<-` is the conventional assignment
# operator; `=` is equivalent at the top level.
x <- c(1, 2, 3, 4)
y <- c(5, 6, 7, 8)
length(x)
## [1] 4
length(y)
## [1] 4
# Arithmetic on vectors acts element by element
x + y
## [1]  6  8 10 12
x^2
## [1]  1  4  9 16
x^2 + y^2
## [1] 26 40 58 80
# ls() lists the objects currently defined in the workspace
ls()
## [1] "x" "y"
# rm() deletes the named objects
rm(x, y)
ls()
## character(0)
# Creating sequences: start at 2, step by 2, five values in total.
# The argument name is spelled out as `length.out`; the abbreviated
# `length` only works through partial argument matching.
x <- seq(from = 2, length.out = 5, by = 2)
x
## [1]  2  4  6  8 10
# We can ask R about a command: `?name` opens its help page
?seq
## starting httpd help server ... done
# Square brackets access a specific component (here the second one)
x[2]
## [1] 4
# The colon operator is another way to write a sequence
2:4
## [1] 2 3 4
# An index vector requests several components at once:
# x[2], x[3], and x[4]
x[2:4]
## [1] 4 6 8
# A negative index drops a component: this is x without its second component
x[-2]
## [1]  2  6  8 10
# x without components 2, 3, and 4
# (`-2:-4` is the sequence -2, -3, -4: unary minus binds tighter than `:`)
x[-2:-4]
## [1]  2 10
# c stands for concatenate; this also creates a vector
c(2:4)
## [1] 2 3 4
# Equivalent ways to remove components 2, 3, and 4 of x
x[c(-2:-4)]
## [1]  2 10
x[-c(2:4)]
## [1]  2 10
x[-c(2, 3, 4)]
## [1]  2 10
# Matrices are created with matrix();
# by default the data fill the matrix column by column
x <- matrix(data = c(1, 2, 3, 4), nrow = 2, ncol = 2)
x
##      [,1] [,2]
## [1,]    1    3
## [2,]    2    4
# ... or row by row when byrow = TRUE is requested.
# Here the leading arguments are passed by position; a better programming
# practice is to always name all parameters as above.
# TRUE is written out in full: `T` is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
y <- matrix(c(5, 6, 7, 8), 2, 2, byrow = TRUE)
y
##      [,1] [,2]
## [1,]    5    6
## [2,]    7    8
# x is organized by columns (column vectors), y by rows

# Algebraic operations are performed entrywise
z <- sqrt(x)
z
##          [,1]     [,2]
## [1,] 1.000000 1.732051
## [2,] 1.414214 2.000000
z^2
##      [,1] [,2]
## [1,]    1    3
## [2,]    2    4
################################
# Moving toward prob and stats #
################################

# rnorm(5) draws five pseudo-random numbers
# from the standard normal distribution
x <- rnorm(5)
x
## [1] -0.01623584  1.24861905  0.24260954  0.91034165  0.34329212
y <- rnorm(5)
y
## [1] 1.0495602 1.3130091 0.1924788 0.9249057 0.8703098
# plot() is the basic plotting function in R
plot(x, y)

# Correlation between x and y
cor(x, y)
## [1] 0.5056599
# To make an analysis that uses pseudo-random numbers reproducible,
# set the seed of the generator before drawing
set.seed(2023)
x <- rnorm(5)
x
## [1] -0.08378436 -0.98294375 -1.87506732 -0.18614466 -0.63348570
set.seed(2023)
y <- rnorm(5)
y
## [1] -0.08378436 -0.98294375 -1.87506732 -0.18614466 -0.63348570
# Same seed, same values
plot(x, y)

cor(x, y)
## [1] 1
# No new seed is set before these draws; the generator simply continues
# from the state left by the previous calls
x <- rnorm(50)
y <- rnorm(50)
plot(
  x, y,
  xlab = "Measurements errors machine 1",
  ylab = "Measurements errors machine 2",
  main = "Measurement errors",
  col = "red"
)

# A little more about matrices: a 4 x 4 matrix filled column by column
A <- matrix(1:16, nrow = 4, ncol = 4)
A
##      [,1] [,2] [,3] [,4]
## [1,]    1    5    9   13
## [2,]    2    6   10   14
## [3,]    3    7   11   15
## [4,]    4    8   12   16
# Single entry: row 2, column 3
A[2, 3]
## [1] 10
# Rows 1 and 3, columns 2 and 4
A[c(1, 3), c(2, 4)]
##      [,1] [,2]
## [1,]    5   13
## [2,]    7   15
# Rows 1 to 3, columns 2 to 4
A[1:3, 2:4]
##      [,1] [,2] [,3]
## [1,]    5    9   13
## [2,]    6   10   14
## [3,]    7   11   15
A[1, ] # First row
## [1]  1  5  9 13
A[, 1] # First column
## [1] 1 2 3 4
# Note that A[1, ] and A[, 1] are plain vectors, not matrices.
# drop = FALSE keeps the result as a matrix.
A[, 1, drop = FALSE]
##      [,1]
## [1,]    1
## [2,]    2
## [3,]    3
## [4,]    4
A[1:2, ]
##      [,1] [,2] [,3] [,4]
## [1,]    1    5    9   13
## [2,]    2    6   10   14
A[, 1:2]
##      [,1] [,2]
## [1,]    1    5
## [2,]    2    6
## [3,]    3    7
## [4,]    4    8
A[-c(1, 3), ] # The negative sign eliminates the indicated rows
##      [,1] [,2] [,3] [,4]
## [1,]    2    6   10   14
## [2,]    4    8   12   16
# Dimension - number of rows and columns for a two-dimensional matrix
dim(A)
## [1] 4 4
###################
# Analyzing Autos #
###################

library(readxl)
# The data location is defined once and reused below.
# NOTE(review): this path is machine-specific; adjust data_dir when running
# the script elsewhere. read_excel() below is given the full path, so the
# setwd() call looks unnecessary for loading the data - confirm before
# removing it.
data_dir <- "C:/Users/lgawarec/Documents/MATH-430/R files"
autos_path <- file.path(data_dir, "Autosxlsx.xlsx")
setwd(data_dir)
getwd()
## [1] "C:/Users/lgawarec/Documents/MATH-430/R files"
# The following commands are left commented out because, when
# compiling a report, there cannot be two different files.
# Uncomment them for presentation, then comment them out again for compiling.
# Autos <- read_excel(autos_path)
# summary(Autos)
# View(Autos)
# dim(Autos)
# Something is wrong: why is horsepower a character variable?

# Reading "?" as NA - Not Available (missing data)
Autos <- read_excel(autos_path, na = "?")
View(Autos)
summary(Autos)
##       mpg          cylinders      displacement     horsepower        weight    
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0   Min.   :1613  
##  1st Qu.:17.50   1st Qu.:4.000   1st Qu.:104.0   1st Qu.: 75.0   1st Qu.:2223  
##  Median :23.00   Median :4.000   Median :146.0   Median : 93.5   Median :2800  
##  Mean   :23.52   Mean   :5.458   Mean   :193.5   Mean   :104.5   Mean   :2970  
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:262.0   3rd Qu.:126.0   3rd Qu.:3609  
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0   Max.   :5140  
##                                                  NA's   :5                     
##   acceleration        year           origin          name          
##  Min.   : 8.00   Min.   :70.00   Min.   :1.000   Length:397        
##  1st Qu.:13.80   1st Qu.:73.00   1st Qu.:1.000   Class :character  
##  Median :15.50   Median :76.00   Median :1.000   Mode  :character  
##  Mean   :15.56   Mean   :75.99   Mean   :1.574                     
##  3rd Qu.:17.10   3rd Qu.:79.00   3rd Qu.:2.000                     
##  Max.   :24.80   Max.   :82.00   Max.   :3.000                     
## 
dim(Autos)
## [1] 397   9
# This command produces an "individual values plot" of mpg by cylinders
plot(Autos$cylinders, Autos$mpg,
     xlab = "Cylinders", ylab = "mpg",
     main = "How cylinders affect mpg")

# To plot side-by-side box-plots, we convert "cylinders" from a numerical
# variable to a factor variable
cylinderQ <- as.factor(Autos$cylinders)
head(cylinderQ)
## [1] 8 8 8 8 8 8
## Levels: 3 4 5 6 8
# TRUE is written out in full; `T` is a variable that can be reassigned
plot(cylinderQ, Autos$mpg,
     varwidth = TRUE, xlab = "Number of cylinders",
     ylab = "mpg", main = "How cylinders affect mpg")

# Here is a way to interact with the graph
# win.graph()
# plot(Autos$horsepower,Autos$mpg)
# identify(Autos$horsepower,Autos$mpg, Autos$name, tolerance=0.5)

# IMPORTANT: from now on, we will use the following packages:
# "MASS" and "ISLR2"
# We will install them now
# install.packages("MASS")
# install.packages("ISLR2")

# From now on, start every R script with two commands:
library(MASS)
library(ISLR2)
## 
## Attaching package: 'ISLR2'
## 
## The following object is masked from 'package:MASS':
## 
##     Boston